{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': 'simple', 'log_file': 'chkpt/ja-en.E18-D4/train.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': 'wmt23', 'azureml_logging': False, 'seed': 0, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 8, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 8, 'distributed_num_procs': 8, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': 'tcp://localhost:35163', 'distributed_port': 35163, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 8, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 0, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 8192, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 100000, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 8192, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 60000, 'stop_time_hours': 0.0, 'clip_norm': 1.0, 'sentence_avg': False, 'update_freq': [8], 'lr': [0.001], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False, 'debug_param_names': False}, 'checkpoint': {'_name': None, 'save_dir': 'chkpt/ja-en.E18-D4', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 100000, 'save_interval_updates': 1000, 'keep_interval_updates': 10, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 8}, 'generation': {'_name': None, 'beam': 5, 'beam_mt': 0, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'max_len_a_mt': 0.0, 'max_len_b_mt': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'lenpen_mt': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(no_progress_bar=False, log_interval=100, log_format='simple', log_file='chkpt/ja-en.E18-D4/train.log', aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project='wmt23', azureml_logging=False, seed=0, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=True, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=8, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='label_smoothed_cross_entropy', tokenizer=None, bpe=None, optimizer='adam', lr_scheduler='inverse_sqrt', scoring='bleu', task='translation', num_workers=0, skip_invalid_size_inputs_valid_test=False, max_tokens=8192, batch_size=None, required_batch_size_multiple=8, required_seq_len_multiple=1, dataset_impl=None, data_buffer_size=10, train_subset='train', valid_subset='valid', combine_valid_subsets=None, ignore_unused_valid_subsets=False, validate_interval=100000, validate_interval_updates=0, validate_after_updates=0, fixed_validation_seed=None, disable_validation=False, max_tokens_valid=8192, batch_size_valid=None, max_valid_steps=None, curriculum=0, gen_subset='test', num_shards=1, shard_id=0, grouped_shuffling=False, update_epoch_batch_itr=False, update_ordered_indices_seed=False, distributed_world_size=8, distributed_num_procs=8, distributed_rank=0, distributed_backend='nccl', distributed_init_method=None, distributed_port=-1, device_id=0, distributed_no_spawn=False, ddp_backend='pytorch_ddp', ddp_comm_hook='none', bucket_cap_mb=25, fix_batches_to_gpus=False, find_unused_parameters=False, gradient_as_bucket_view=False, fast_stat_sync=False, heartbeat_timeout=-1, broadcast_buffers=False, slowmo_momentum=None, slowmo_base_algorithm='localsgd', localsgd_frequency=3, nprocs_per_node=8, pipeline_model_parallel=False, pipeline_balance=None, pipeline_devices=None, pipeline_chunks=0, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_checkpoint='never', zero_sharding='none', no_reshard_after_forward=False, fp32_reduce_scatter=False, cpu_offload=False, use_sharded_state=False, not_fsdp_flatten_parameters=False, arch='transformer_vaswani_wmt_en_de_big', max_epoch=0, max_update=60000, stop_time_hours=0, clip_norm=1.0, sentence_avg=False, update_freq=[8], lr=[0.001], stop_min_lr=-1.0, use_bmuf=False, skip_remainder_batch=False, debug_param_names=False, save_dir='chkpt/ja-en.E18-D4', restore_file='checkpoint_last.pt', continue_once=None, finetune_from_model=None, reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, optimizer_overrides='{}', save_interval=100000, save_interval_updates=1000, keep_interval_updates=10, keep_interval_updates_pattern=-1, keep_last_epochs=-1, keep_best_checkpoints=-1, no_save=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_save_optimizer_state=False, best_checkpoint_metric='loss', maximize_best_checkpoint_metric=False, patience=-1, checkpoint_suffix='', checkpoint_shard_count=1, load_checkpoint_on_all_dp_ranks=False, write_checkpoints_asynchronously=False, store_ema=False, ema_decay=0.9999, ema_start_update=0, ema_seed_model=None, ema_update_freq=1, ema_fp32=False, data='binarized/ja-en/', source_lang=None, target_lang=None, load_alignments=False, left_pad_source=True, left_pad_target=False, upsample_primary=-1, truncate_source=False, num_batch_buckets=0, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_tokenized_bleu=False, eval_bleu_remove_bpe=None, eval_bleu_print_samples=False, label_smoothing=0.1, report_accuracy=False, ignore_prefix_size=0, adam_betas='(0.9, 0.98)', adam_eps=1e-08, weight_decay=0.0, use_old_adam=False, fp16_adam_stats=False, warmup_updates=4000, warmup_init_lr=-1, pad=1, eos=2, unk=3, encoder_ffn_embed_dim=8192, decoder_ffn_embed_dim=8192, encoder_layers=18, decoder_layers=4, dropout=0.1, share_decoder_input_output_embed=True, no_seed_provided=False, encoder_embed_dim=1024, encoder_attention_heads=16, encoder_normalize_before=False, decoder_embed_dim=1024, decoder_attention_heads=16, encoder_embed_path=None, encoder_learned_pos=False, decoder_embed_path=None, decoder_normalize_before=False, decoder_learned_pos=False, attention_dropout=0.0, activation_dropout=0.0, activation_fn='relu', adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, share_all_embeddings=False, merge_src_tgt_embed=False, no_token_positional_embeddings=False, adaptive_input=False, no_cross_attention=False, cross_self_attention=False, decoder_output_dim=1024, decoder_input_dim=1024, no_scale_embedding=False, layernorm_embedding=False, tie_adaptive_weights=False, checkpoint_activations=False, offload_activations=False, encoder_layers_to_keep=None, decoder_layers_to_keep=None, encoder_layerdrop=0, decoder_layerdrop=0, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, _name='transformer_vaswani_wmt_en_de_big'), 'task': {'_name': 'translation', 'data': 'binarized/ja-en/', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'label_smoothed_cross_entropy', 'label_smoothing': 0.1, 'report_accuracy': False, 'ignore_prefix_size': 0, 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9, 0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.001]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.001]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}} TransformerModel( (encoder): TransformerEncoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(32000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-17): 18 x TransformerEncoderLayerBase( (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (dropout_module): FairseqDropout() (activation_dropout_module): FairseqDropout() (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) ) (decoder): TransformerDecoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(16000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-3): 4 x TransformerDecoderLayerBase( (dropout_module): FairseqDropout() (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (activation_dropout_module): FairseqDropout() (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (encoder_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (encoder_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) (output_projection): Linear(in_features=1024, out_features=16000, bias=False) ) ) task: TranslationTask model: TransformerModel criterion: LabelSmoothedCrossEntropyCriterion num. shared model params: 527,710,208 (num. trained: 527,710,208) num. expert model params: 0 (num. trained: 0) training on 8 devices (GPUs/TPUs) max tokens per device = 8192 and max sentences per device = None begin dry-run validation on "valid" subset Start iterating over samples epoch 001: 101 / 1707 loss=12.008, nll_loss=11.686, ppl=3294.08, wps=357810, ups=0.73, wpb=490364, bsz=16388.4, num_updates=100, lr=2.5e-05, gnorm=2.695, clip=78, loss_scale=4, train_wall=140, gb_free=60.4, wall=160 epoch 001: 201 / 1707 loss=10.477, nll_loss=9.927, ppl=973.35, wps=362076, ups=0.74, wpb=491185, bsz=16454.7, num_updates=200, lr=5e-05, gnorm=1.75, clip=93, loss_scale=4, train_wall=135, gb_free=60.7, wall=295 epoch 001: 301 / 1707 loss=9.801, nll_loss=9.124, ppl=558.1, wps=360515, ups=0.74, wpb=490043, bsz=16405.9, num_updates=300, lr=7.5e-05, gnorm=1.949, clip=100, loss_scale=8, train_wall=135, gb_free=60.6, wall=431 epoch 001: 402 / 1707 loss=9.256, nll_loss=8.479, ppl=356.74, wps=355897, ups=0.73, wpb=489100, bsz=16046.4, num_updates=400, lr=0.0001, gnorm=1.782, clip=99, loss_scale=4, train_wall=137, gb_free=60.6, wall=569 epoch 001: 502 / 1707 loss=8.855, nll_loss=8.002, ppl=256.37, wps=360599, ups=0.74, wpb=490358, bsz=16553.7, num_updates=500, lr=0.000125, gnorm=1.646, clip=100, loss_scale=4, train_wall=135, gb_free=60.3, wall=705 epoch 001: 602 / 1707 loss=8.551, nll_loss=7.643, ppl=199.93, wps=361150, ups=0.74, wpb=490641, bsz=16145.5, num_updates=600, lr=0.00015, gnorm=1.427, clip=96, loss_scale=4, train_wall=135, gb_free=60.5, wall=841 epoch 001: 702 / 1707 loss=8.283, nll_loss=7.33, ppl=160.91, wps=357855, ups=0.73, wpb=488793, bsz=16209.2, num_updates=700, lr=0.000175, gnorm=1.36, clip=91, loss_scale=8, train_wall=136, gb_free=60.3, wall=977 epoch 001: 803 / 1707 loss=8.003, nll_loss=7.005, ppl=128.45, wps=357157, ups=0.73, wpb=489709, bsz=16439.7, num_updates=800, lr=0.0002, gnorm=1.204, clip=80, loss_scale=4, train_wall=137, gb_free=60, wall=1114 epoch 001: 903 / 1707 loss=7.754, nll_loss=6.718, ppl=105.25, wps=363075, ups=0.74, wpb=490058, bsz=16276.1, num_updates=900, lr=0.000225, gnorm=1.054, clip=58, loss_scale=4, train_wall=134, gb_free=60.4, wall=1249 epoch 001: 1003 / 1707 loss=7.55, nll_loss=6.482, ppl=89.36, wps=363319, ups=0.74, wpb=490591, bsz=16361.5, num_updates=1000, lr=0.00025, gnorm=1.032, clip=57, loss_scale=8, train_wall=135, gb_free=60.7, wall=1384 begin validation on "valid" subset epoch 001 | valid on 'valid' subset | loss 7.493 | nll_loss 6.393 | ppl 84.01 | wps 218685 | wpb 22263 | bsz 1004 | num_updates 1000 epoch 001: 1103 / 1707 loss=7.368, nll_loss=6.272, ppl=77.27, wps=322298, ups=0.66, wpb=490523, bsz=16257.5, num_updates=1100, lr=0.000275, gnorm=0.988, clip=43, loss_scale=8, train_wall=133, gb_free=60.7, wall=1537 epoch 001: 1203 / 1707 loss=7.219, nll_loss=6.101, ppl=68.65, wps=364000, ups=0.74, wpb=490286, bsz=16216.5, num_updates=1200, lr=0.0003, gnorm=0.922, clip=27, loss_scale=8, train_wall=134, gb_free=60.6, wall=1671 epoch 001: 1304 / 1707 loss=7.054, nll_loss=5.911, ppl=60.16, wps=360723, ups=0.73, wpb=490962, bsz=16368.5, num_updates=1300, lr=0.000325, gnorm=0.903, clip=26, loss_scale=8, train_wall=136, gb_free=60.6, wall=1807 epoch 001: 1404 / 1707 loss=6.923, nll_loss=5.761, ppl=54.23, wps=364897, ups=0.74, wpb=491040, bsz=16388.5, num_updates=1400, lr=0.00035, gnorm=0.897, clip=24, loss_scale=8, train_wall=134, gb_free=61.1, wall=1942 epoch 001: 1504 / 1707 loss=6.8, nll_loss=5.62, ppl=49.17, wps=364675, ups=0.74, wpb=489580, bsz=16325, num_updates=1500, lr=0.000375, gnorm=0.86, clip=20, loss_scale=8, train_wall=134, gb_free=60.5, wall=2076 epoch 001: 1605 / 1707 loss=6.697, nll_loss=5.501, ppl=45.28, wps=359013, ups=0.73, wpb=489128, bsz=16283.5, num_updates=1600, lr=0.0004, gnorm=0.857, clip=14, loss_scale=8, train_wall=136, gb_free=60.6, wall=2212 epoch 001: 1705 / 1707 loss=6.583, nll_loss=5.369, ppl=41.32, wps=362632, ups=0.74, wpb=489728, bsz=16559.8, num_updates=1700, lr=0.000425, gnorm=0.831, clip=16, loss_scale=8, train_wall=135, gb_free=60.5, wall=2348 end of epoch 1 (average epoch stats below) epoch 001 | loss 8.186 | nll_loss 7.23 | ppl 150.14 | wps 358406 | ups 0.73 | wpb 489897 | bsz 16328.5 | num_updates 1702 | lr 0.0004255 | gnorm 1.303 | clip 60 | loss_scale 8 | train_wall 2303 | gb_free 60.5 | wall 2349 Start iterating over samples epoch 002: 99 / 1707 loss=6.498, nll_loss=5.271, ppl=38.61, wps=360030, ups=0.74, wpb=486849, bsz=16219.1, num_updates=1800, lr=0.00045, gnorm=0.83, clip=14, loss_scale=8, train_wall=134, gb_free=60.8, wall=2483 epoch 002: 99 / 1707 loss=6.498, nll_loss=5.271, ppl=38.61, wps=360030, ups=0.74, wpb=486849, bsz=16219.1, num_updates=1800, lr=0.00045, gnorm=0.83, clip=14, loss_scale=8, train_wall=134, gb_free=60.8, wall=2483 epoch 002: 199 / 1707 loss=6.403, nll_loss=5.163, ppl=35.83, wps=365181, ups=0.74, wpb=490189, bsz=16337.4, num_updates=1900, lr=0.000475, gnorm=0.789, clip=4, loss_scale=8, train_wall=134, gb_free=60.4, wall=2617 epoch 002: 199 / 1707 loss=6.403, nll_loss=5.163, ppl=35.83, wps=365181, ups=0.74, wpb=490189, bsz=16337.4, num_updates=1900, lr=0.000475, gnorm=0.789, clip=4, loss_scale=8, train_wall=134, gb_free=60.4, wall=2617 epoch 002: 300 / 1707 loss=6.328, nll_loss=5.077, ppl=33.76, wps=359862, ups=0.74, wpb=489604, bsz=16610.4, num_updates=2000, lr=0.0005, gnorm=0.78, clip=15, loss_scale=4, train_wall=136, gb_free=60.2, wall=2753 epoch 002: 300 / 1707 loss=6.328, nll_loss=5.077, ppl=33.76, wps=359862, ups=0.74, wpb=489604, bsz=16610.4, num_updates=2000, lr=0.0005, gnorm=0.78, clip=15, loss_scale=4, train_wall=136, gb_free=60.2, wall=2753 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 6.4 | nll_loss 5.128 | ppl 34.96 | wps 220261 | wpb 22263 | bsz 1004 | num_updates 2000 | best_loss 6.4 epoch 002 | valid on 'valid' subset | loss 6.4 | nll_loss 5.128 | ppl 34.96 | wps 220261 | wpb 22263 | bsz 1004 | num_updates 2000 | best_loss 6.4 epoch 002: 400 / 1707 loss=6.269, nll_loss=5.009, ppl=32.2, wps=320525, ups=0.65, wpb=490932, bsz=16539.1, num_updates=2100, lr=0.000525, gnorm=0.796, clip=10, loss_scale=4, train_wall=133, gb_free=60.2, wall=2906 epoch 002: 400 / 1707 loss=6.269, nll_loss=5.009, ppl=32.2, wps=320525, ups=0.65, wpb=490932, bsz=16539.1, num_updates=2100, lr=0.000525, gnorm=0.796, clip=10, loss_scale=4, train_wall=133, gb_free=60.2, wall=2906 epoch 002: 500 / 1707 loss=6.218, nll_loss=4.951, ppl=30.93, wps=363755, ups=0.74, wpb=489754, bsz=16355.1, num_updates=2200, lr=0.00055, gnorm=0.773, clip=9, loss_scale=4, train_wall=134, gb_free=60.4, wall=3041 epoch 002: 500 / 1707 loss=6.218, nll_loss=4.951, ppl=30.93, wps=363755, ups=0.74, wpb=489754, bsz=16355.1, num_updates=2200, lr=0.00055, gnorm=0.773, clip=9, loss_scale=4, train_wall=134, gb_free=60.4, wall=3041 epoch 002: 601 / 1707 loss=6.17, nll_loss=4.896, ppl=29.78, wps=362121, ups=0.74, wpb=490762, bsz=16305.1, num_updates=2300, lr=0.000575, gnorm=0.783, clip=12, loss_scale=4, train_wall=135, gb_free=60.4, wall=3176 epoch 002: 601 / 1707 loss=6.17, nll_loss=4.896, ppl=29.78, wps=362121, ups=0.74, wpb=490762, bsz=16305.1, num_updates=2300, lr=0.000575, gnorm=0.783, clip=12, loss_scale=4, train_wall=135, gb_free=60.4, wall=3176 epoch 002: 701 / 1707 loss=6.11, nll_loss=4.828, ppl=28.4, wps=364472, ups=0.74, wpb=489830, bsz=16382.5, num_updates=2400, lr=0.0006, gnorm=0.742, clip=8, loss_scale=4, train_wall=134, gb_free=60.6, wall=3311 epoch 002: 701 / 1707 loss=6.11, nll_loss=4.828, ppl=28.4, wps=364472, ups=0.74, wpb=489830, bsz=16382.5, num_updates=2400, lr=0.0006, gnorm=0.742, clip=8, loss_scale=4, train_wall=134, gb_free=60.6, wall=3311 epoch 002: 801 / 1707 loss=6.061, nll_loss=4.773, ppl=27.35, wps=364959, ups=0.74, wpb=490016, bsz=16429.2, num_updates=2500, lr=0.000625, gnorm=0.746, clip=6, loss_scale=4, train_wall=134, gb_free=60.2, wall=3445 epoch 002: 801 / 1707 loss=6.061, nll_loss=4.773, ppl=27.35, wps=364959, ups=0.74, wpb=490016, bsz=16429.2, num_updates=2500, lr=0.000625, gnorm=0.746, clip=6, loss_scale=4, train_wall=134, gb_free=60.2, wall=3445 epoch 002: 902 / 1707 loss=6.018, nll_loss=4.725, ppl=26.44, wps=361254, ups=0.74, wpb=489467, bsz=16243.5, num_updates=2600, lr=0.00065, gnorm=0.704, clip=2, loss_scale=4, train_wall=135, gb_free=60.4, wall=3581 epoch 002: 902 / 1707 loss=6.018, nll_loss=4.725, ppl=26.44, wps=361254, ups=0.74, wpb=489467, bsz=16243.5, num_updates=2600, lr=0.00065, gnorm=0.704, clip=2, loss_scale=4, train_wall=135, gb_free=60.4, wall=3581 epoch 002: 1002 / 1707 loss=6.03, nll_loss=4.74, ppl=26.73, wps=366791, ups=0.75, wpb=489743, bsz=16291, num_updates=2700, lr=0.000675, gnorm=0.754, clip=17, loss_scale=4, train_wall=133, gb_free=61.2, wall=3714 epoch 002: 1002 / 1707 loss=6.03, nll_loss=4.74, ppl=26.73, wps=366791, ups=0.75, wpb=489743, bsz=16291, num_updates=2700, lr=0.000675, gnorm=0.754, clip=17, loss_scale=4, train_wall=133, gb_free=61.2, wall=3714 epoch 002: 1102 / 1707 loss=5.955, nll_loss=4.656, ppl=25.21, wps=366152, ups=0.75, wpb=490708, bsz=16071.4, num_updates=2800, lr=0.0007, gnorm=0.703, clip=3, loss_scale=4, train_wall=134, gb_free=60.6, wall=3848 epoch 002: 1102 / 1707 loss=5.955, nll_loss=4.656, ppl=25.21, wps=366152, ups=0.75, wpb=490708, bsz=16071.4, num_updates=2800, lr=0.0007, gnorm=0.703, clip=3, loss_scale=4, train_wall=134, gb_free=60.6, wall=3848 epoch 002: 1203 / 1707 loss=5.938, nll_loss=4.638, ppl=24.89, wps=360556, ups=0.74, wpb=489517, bsz=16357, num_updates=2900, lr=0.000725, gnorm=0.726, clip=7, loss_scale=4, train_wall=135, gb_free=60.5, wall=3984 epoch 002: 1203 / 1707 loss=5.938, nll_loss=4.638, ppl=24.89, wps=360556, ups=0.74, wpb=489517, bsz=16357, num_updates=2900, lr=0.000725, gnorm=0.726, clip=7, loss_scale=4, train_wall=135, gb_free=60.5, wall=3984 epoch 002: 1303 / 1707 loss=5.932, nll_loss=4.632, ppl=24.79, wps=366349, ups=0.75, wpb=490211, bsz=16318.9, num_updates=3000, lr=0.00075, gnorm=0.715, clip=7, loss_scale=4, train_wall=133, gb_free=60.5, wall=4118 epoch 002: 1303 / 1707 loss=5.932, nll_loss=4.632, ppl=24.79, wps=366349, ups=0.75, wpb=490211, bsz=16318.9, num_updates=3000, lr=0.00075, gnorm=0.715, clip=7, loss_scale=4, train_wall=133, gb_free=60.5, wall=4118 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 6.135 | nll_loss 4.84 | ppl 28.64 | wps 219501 | wpb 22263 | bsz 1004 | num_updates 3000 | best_loss 6.135 epoch 002 | valid on 'valid' subset | loss 6.135 | nll_loss 4.84 | ppl 28.64 | wps 219501 | wpb 22263 | bsz 1004 | num_updates 3000 | best_loss 6.135 epoch 002: 1403 / 1707 loss=5.854, nll_loss=4.544, ppl=23.32, wps=325216, ups=0.66, wpb=489522, bsz=16368.5, num_updates=3100, lr=0.000775, gnorm=0.668, clip=2, loss_scale=4, train_wall=133, gb_free=60.4, wall=4268 epoch 002: 1403 / 1707 loss=5.854, nll_loss=4.544, ppl=23.32, wps=325216, ups=0.66, wpb=489522, bsz=16368.5, num_updates=3100, lr=0.000775, gnorm=0.668, clip=2, loss_scale=4, train_wall=133, gb_free=60.4, wall=4268 epoch 002: 1504 / 1707 loss=5.804, nll_loss=4.487, ppl=22.42, wps=363563, ups=0.74, wpb=491738, bsz=16353.9, num_updates=3200, lr=0.0008, gnorm=0.682, clip=0, loss_scale=4, train_wall=135, gb_free=60.2, wall=4403 epoch 002: 1504 / 1707 loss=5.804, nll_loss=4.487, ppl=22.42, wps=363563, ups=0.74, wpb=491738, bsz=16353.9, num_updates=3200, lr=0.0008, gnorm=0.682, clip=0, loss_scale=4, train_wall=135, gb_free=60.2, wall=4403 epoch 002: 1605 / 1707 loss=5.853, nll_loss=4.544, ppl=23.33, wps=363272, ups=0.74, wpb=489557, bsz=16300.3, num_updates=3300, lr=0.000825, gnorm=0.784, clip=12, loss_scale=2, train_wall=134, gb_free=60.8, wall=4538 epoch 002: 1605 / 1707 loss=5.853, nll_loss=4.544, ppl=23.33, wps=363272, ups=0.74, wpb=489557, bsz=16300.3, num_updates=3300, lr=0.000825, gnorm=0.784, clip=12, loss_scale=2, train_wall=134, gb_free=60.8, wall=4538 epoch 002: 1705 / 1707 loss=5.794, nll_loss=4.478, ppl=22.28, wps=365267, ups=0.75, wpb=489457, bsz=16123, num_updates=3400, lr=0.00085, gnorm=0.634, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=4672 epoch 002: 1705 / 1707 loss=5.794, nll_loss=4.478, ppl=22.28, wps=365267, ups=0.75, wpb=489457, bsz=16123, num_updates=3400, lr=0.00085, gnorm=0.634, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=4672 end of epoch 2 (average epoch stats below) epoch 002 | loss 6.072 | nll_loss 4.788 | ppl 27.63 | wps 358254 | ups 0.73 | wpb 489887 | bsz 16327.2 | num_updates 3402 | lr 0.0008505 | gnorm 0.741 | clip 7.6 | loss_scale 2 | train_wall 2280 | gb_free 61.8 | wall 4674 epoch 002 | loss 6.072 | nll_loss 4.788 | ppl 27.63 | wps 358254 | ups 0.73 | wpb 489887 | bsz 16327.2 | num_updates 3402 | lr 0.0008505 | gnorm 0.741 | clip 7.6 | loss_scale 2 | train_wall 2280 | gb_free 61.8 | wall 4674 Start iterating over samples epoch 003: 98 / 1707 loss=5.711, nll_loss=4.382, ppl=20.86, wps=364676, ups=0.75, wpb=486955, bsz=16302.6, num_updates=3500, lr=0.000875, gnorm=0.632, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=4806 epoch 003: 98 / 1707 loss=5.711, nll_loss=4.382, ppl=20.86, wps=364676, ups=0.75, wpb=486955, bsz=16302.6, num_updates=3500, lr=0.000875, gnorm=0.632, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=4806 epoch 003: 98 / 1707 loss=5.711, nll_loss=4.382, ppl=20.86, wps=364676, ups=0.75, wpb=486955, bsz=16302.6, num_updates=3500, lr=0.000875, gnorm=0.632, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=4806 epoch 003: 198 / 1707 loss=5.733, nll_loss=4.409, ppl=21.24, wps=366271, ups=0.75, wpb=491216, bsz=16487.7, num_updates=3600, lr=0.0009, gnorm=0.694, clip=9, loss_scale=4, train_wall=134, gb_free=60.7, wall=4940 epoch 003: 198 / 1707 loss=5.733, nll_loss=4.409, ppl=21.24, wps=366271, ups=0.75, wpb=491216, bsz=16487.7, num_updates=3600, lr=0.0009, gnorm=0.694, clip=9, loss_scale=4, train_wall=134, gb_free=60.7, wall=4940 epoch 003: 198 / 1707 loss=5.733, nll_loss=4.409, ppl=21.24, wps=366271, ups=0.75, wpb=491216, bsz=16487.7, num_updates=3600, lr=0.0009, gnorm=0.694, clip=9, loss_scale=4, train_wall=134, gb_free=60.7, wall=4940 epoch 003: 299 / 1707 loss=5.68, nll_loss=4.35, ppl=20.39, wps=360178, ups=0.74, wpb=488930, bsz=16569.4, num_updates=3700, lr=0.000925, gnorm=0.672, clip=1, loss_scale=2, train_wall=135, gb_free=60.7, wall=5076 epoch 003: 299 / 1707 loss=5.68, nll_loss=4.35, ppl=20.39, wps=360178, ups=0.74, wpb=488930, bsz=16569.4, num_updates=3700, lr=0.000925, gnorm=0.672, clip=1, loss_scale=2, train_wall=135, gb_free=60.7, wall=5076 epoch 003: 299 / 1707 loss=5.68, nll_loss=4.35, ppl=20.39, wps=360178, ups=0.74, wpb=488930, bsz=16569.4, num_updates=3700, lr=0.000925, gnorm=0.672, clip=1, loss_scale=2, train_wall=135, gb_free=60.7, wall=5076 epoch 003: 399 / 1707 loss=5.659, nll_loss=4.326, ppl=20.05, wps=366429, ups=0.75, wpb=490522, bsz=16417.4, num_updates=3800, lr=0.00095, gnorm=0.647, clip=2, loss_scale=2, train_wall=133, gb_free=60.4, wall=5209 epoch 003: 399 / 1707 loss=5.659, nll_loss=4.326, ppl=20.05, wps=366429, ups=0.75, wpb=490522, bsz=16417.4, num_updates=3800, lr=0.00095, gnorm=0.647, clip=2, loss_scale=2, train_wall=133, gb_free=60.4, wall=5209 epoch 003: 399 / 1707 loss=5.659, nll_loss=4.326, ppl=20.05, wps=366429, ups=0.75, wpb=490522, bsz=16417.4, num_updates=3800, lr=0.00095, gnorm=0.647, clip=2, loss_scale=2, train_wall=133, gb_free=60.4, wall=5209 epoch 003: 499 / 1707 loss=5.612, nll_loss=4.272, ppl=19.32, wps=365976, ups=0.75, wpb=490056, bsz=16580.2, num_updates=3900, lr=0.000975, gnorm=0.662, clip=1, loss_scale=4, train_wall=133, gb_free=60.7, wall=5343 epoch 003: 499 / 1707 loss=5.612, nll_loss=4.272, ppl=19.32, wps=365976, ups=0.75, wpb=490056, bsz=16580.2, num_updates=3900, lr=0.000975, gnorm=0.662, clip=1, loss_scale=4, train_wall=133, gb_free=60.7, wall=5343 epoch 003: 499 / 1707 loss=5.612, nll_loss=4.272, ppl=19.32, wps=365976, ups=0.75, wpb=490056, bsz=16580.2, num_updates=3900, lr=0.000975, gnorm=0.662, clip=1, loss_scale=4, train_wall=133, gb_free=60.7, wall=5343 epoch 003: 599 / 1707 loss=5.568, nll_loss=4.222, ppl=18.66, wps=366107, ups=0.75, wpb=489793, bsz=16289.4, num_updates=4000, lr=0.001, gnorm=0.619, clip=1, loss_scale=4, train_wall=133, gb_free=60.4, wall=5477 epoch 003: 599 / 1707 loss=5.568, nll_loss=4.222, ppl=18.66, wps=366107, ups=0.75, wpb=489793, bsz=16289.4, num_updates=4000, lr=0.001, gnorm=0.619, clip=1, loss_scale=4, train_wall=133, gb_free=60.4, wall=5477 epoch 003: 599 / 1707 loss=5.568, nll_loss=4.222, ppl=18.66, wps=366107, ups=0.75, wpb=489793, bsz=16289.4, num_updates=4000, lr=0.001, gnorm=0.619, clip=1, loss_scale=4, train_wall=133, gb_free=60.4, wall=5477 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 5.873 | nll_loss 4.564 | ppl 23.66 | wps 221979 | wpb 22263 | bsz 1004 | num_updates 4000 | best_loss 5.873 epoch 003 | valid on 'valid' subset | loss 5.873 | nll_loss 4.564 | ppl 23.66 | wps 221979 | wpb 22263 | bsz 1004 | num_updates 4000 | best_loss 5.873 epoch 003 | valid on 'valid' subset | loss 5.873 | nll_loss 4.564 | ppl 23.66 | wps 221979 | wpb 22263 | bsz 1004 | num_updates 4000 | best_loss 5.873 epoch 003: 699 / 1707 loss=5.543, nll_loss=4.195, ppl=18.31, wps=323230, ups=0.66, wpb=490319, bsz=16259.2, num_updates=4100, lr=0.00098773, gnorm=0.653, clip=1, loss_scale=4, train_wall=133, gb_free=60.5, wall=5629 epoch 003: 699 / 1707 loss=5.543, nll_loss=4.195, ppl=18.31, wps=323230, ups=0.66, wpb=490319, bsz=16259.2, num_updates=4100, lr=0.00098773, gnorm=0.653, clip=1, loss_scale=4, train_wall=133, gb_free=60.5, wall=5629 epoch 003: 699 / 1707 loss=5.543, nll_loss=4.195, ppl=18.31, wps=323230, ups=0.66, wpb=490319, bsz=16259.2, num_updates=4100, lr=0.00098773, gnorm=0.653, clip=1, loss_scale=4, train_wall=133, gb_free=60.5, wall=5629 epoch 003: 799 / 1707 loss=5.492, nll_loss=4.135, ppl=17.57, wps=365732, ups=0.75, wpb=489505, bsz=15963.8, num_updates=4200, lr=0.0009759, gnorm=0.596, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=5763 epoch 003: 799 / 1707 loss=5.492, nll_loss=4.135, ppl=17.57, wps=365732, ups=0.75, wpb=489505, bsz=15963.8, num_updates=4200, lr=0.0009759, gnorm=0.596, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=5763 epoch 003: 799 / 1707 loss=5.492, nll_loss=4.135, ppl=17.57, wps=365732, ups=0.75, wpb=489505, bsz=15963.8, num_updates=4200, lr=0.0009759, gnorm=0.596, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=5763 epoch 003: 900 / 1707 loss=5.452, nll_loss=4.091, ppl=17.04, wps=361766, ups=0.74, wpb=489451, bsz=16254.8, num_updates=4300, lr=0.000964486, gnorm=0.591, clip=1, loss_scale=4, train_wall=135, gb_free=60.4, wall=5898 epoch 003: 900 / 1707 loss=5.452, nll_loss=4.091, ppl=17.04, wps=361766, ups=0.74, wpb=489451, bsz=16254.8, num_updates=4300, lr=0.000964486, gnorm=0.591, clip=1, loss_scale=4, train_wall=135, gb_free=60.4, wall=5898 epoch 003: 900 / 1707 loss=5.452, nll_loss=4.091, ppl=17.04, wps=361766, ups=0.74, wpb=489451, bsz=16254.8, num_updates=4300, lr=0.000964486, gnorm=0.591, clip=1, loss_scale=4, train_wall=135, gb_free=60.4, wall=5898 epoch 003: 1000 / 1707 loss=5.41, nll_loss=4.043, ppl=16.49, wps=365153, ups=0.75, wpb=489481, bsz=16090, num_updates=4400, lr=0.000953463, gnorm=0.59, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=6032 epoch 003: 1000 / 1707 loss=5.41, nll_loss=4.043, ppl=16.49, wps=365153, ups=0.75, wpb=489481, bsz=16090, num_updates=4400, lr=0.000953463, gnorm=0.59, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=6032 epoch 003: 1000 / 1707 loss=5.41, nll_loss=4.043, ppl=16.49, wps=365153, ups=0.75, wpb=489481, bsz=16090, num_updates=4400, lr=0.000953463, gnorm=0.59, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=6032 epoch 003: 1100 / 1707 loss=5.363, nll_loss=3.99, ppl=15.89, wps=366908, ups=0.75, wpb=490670, bsz=16236.6, num_updates=4500, lr=0.000942809, gnorm=0.564, clip=0, loss_scale=8, train_wall=133, gb_free=60.9, wall=6166 epoch 003: 1100 / 1707 loss=5.363, nll_loss=3.99, ppl=15.89, wps=366908, ups=0.75, wpb=490670, bsz=16236.6, num_updates=4500, lr=0.000942809, gnorm=0.564, clip=0, loss_scale=8, train_wall=133, gb_free=60.9, wall=6166 epoch 003: 1100 / 1707 loss=5.363, nll_loss=3.99, ppl=15.89, wps=366908, ups=0.75, wpb=490670, bsz=16236.6, num_updates=4500, lr=0.000942809, gnorm=0.564, clip=0, loss_scale=8, train_wall=133, gb_free=60.9, wall=6166 epoch 003: 1201 / 1707 loss=5.331, nll_loss=3.953, ppl=15.49, wps=362686, ups=0.74, wpb=490918, bsz=16217.6, num_updates=4600, lr=0.000932505, gnorm=0.58, clip=1, loss_scale=4, train_wall=135, gb_free=60.8, wall=6301 epoch 003: 1201 / 1707 loss=5.331, nll_loss=3.953, ppl=15.49, wps=362686, ups=0.74, wpb=490918, bsz=16217.6, num_updates=4600, lr=0.000932505, gnorm=0.58, clip=1, loss_scale=4, train_wall=135, gb_free=60.8, wall=6301 epoch 003: 1201 / 1707 loss=5.331, nll_loss=3.953, ppl=15.49, wps=362686, ups=0.74, wpb=490918, bsz=16217.6, num_updates=4600, lr=0.000932505, gnorm=0.58, clip=1, loss_scale=4, train_wall=135, gb_free=60.8, wall=6301 epoch 003: 1301 / 1707 loss=5.284, nll_loss=3.9, ppl=14.93, wps=367436, ups=0.75, wpb=490728, bsz=16371.4, num_updates=4700, lr=0.000922531, gnorm=0.556, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=6435 epoch 003: 1301 / 1707 loss=5.284, nll_loss=3.9, ppl=14.93, wps=367436, ups=0.75, wpb=490728, bsz=16371.4, num_updates=4700, lr=0.000922531, gnorm=0.556, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=6435 epoch 003: 1301 / 1707 loss=5.284, nll_loss=3.9, ppl=14.93, wps=367436, ups=0.75, wpb=490728, bsz=16371.4, num_updates=4700, lr=0.000922531, gnorm=0.556, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=6435 epoch 003: 1401 / 1707 loss=5.253, nll_loss=3.866, ppl=14.58, wps=365495, ups=0.75, wpb=489396, bsz=16340.3, num_updates=4800, lr=0.000912871, gnorm=0.561, clip=1, loss_scale=8, train_wall=133, gb_free=60.3, wall=6569 epoch 003: 1401 / 1707 loss=5.253, nll_loss=3.866, ppl=14.58, wps=365495, ups=0.75, wpb=489396, bsz=16340.3, num_updates=4800, lr=0.000912871, gnorm=0.561, clip=1, loss_scale=8, train_wall=133, gb_free=60.3, wall=6569 epoch 003: 1401 / 1707 loss=5.253, nll_loss=3.866, ppl=14.58, wps=365495, ups=0.75, wpb=489396, bsz=16340.3, num_updates=4800, lr=0.000912871, gnorm=0.561, clip=1, loss_scale=8, train_wall=133, gb_free=60.3, wall=6569 epoch 003: 1502 / 1707 loss=5.216, nll_loss=3.824, ppl=14.17, wps=361908, ups=0.74, wpb=490086, bsz=16399.2, num_updates=4900, lr=0.000903508, gnorm=0.547, clip=1, loss_scale=4, train_wall=135, gb_free=60.4, wall=6704 epoch 003: 1502 / 1707 loss=5.216, nll_loss=3.824, ppl=14.17, wps=361908, ups=0.74, wpb=490086, bsz=16399.2, num_updates=4900, lr=0.000903508, gnorm=0.547, clip=1, loss_scale=4, train_wall=135, gb_free=60.4, wall=6704 epoch 003: 1502 / 1707 loss=5.216, nll_loss=3.824, ppl=14.17, wps=361908, ups=0.74, wpb=490086, bsz=16399.2, num_updates=4900, lr=0.000903508, gnorm=0.547, clip=1, loss_scale=4, train_wall=135, gb_free=60.4, wall=6704 epoch 003: 1602 / 1707 loss=5.178, nll_loss=3.782, ppl=13.75, wps=367955, ups=0.75, wpb=491427, bsz=16488.6, num_updates=5000, lr=0.000894427, gnorm=0.54, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=6838 epoch 003: 1602 / 1707 loss=5.178, nll_loss=3.782, ppl=13.75, wps=367955, ups=0.75, wpb=491427, bsz=16488.6, num_updates=5000, lr=0.000894427, gnorm=0.54, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=6838 epoch 003: 1602 / 1707 loss=5.178, nll_loss=3.782, ppl=13.75, wps=367955, ups=0.75, wpb=491427, bsz=16488.6, num_updates=5000, lr=0.000894427, gnorm=0.54, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=6838 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 5.33 | nll_loss 3.929 | ppl 15.23 | wps 222375 | wpb 22263 | bsz 1004 | num_updates 5000 | best_loss 5.33 epoch 003 | valid on 'valid' subset | loss 5.33 | nll_loss 3.929 | ppl 15.23 | wps 222375 | wpb 22263 | bsz 1004 | num_updates 5000 | best_loss 5.33 epoch 003 | valid on 'valid' subset | loss 5.33 | nll_loss 3.929 | ppl 15.23 | wps 222375 | wpb 22263 | bsz 1004 | num_updates 5000 | best_loss 5.33 epoch 003: 1703 / 1707 loss=5.161, nll_loss=3.763, ppl=13.57, wps=319360, ups=0.65, wpb=488736, bsz=16336.4, num_updates=5100, lr=0.000885615, gnorm=0.575, clip=2, loss_scale=4, train_wall=134, gb_free=60.7, wall=6991 epoch 003: 1703 / 1707 loss=5.161, nll_loss=3.763, ppl=13.57, wps=319360, ups=0.65, wpb=488736, bsz=16336.4, num_updates=5100, lr=0.000885615, gnorm=0.575, clip=2, loss_scale=4, train_wall=134, gb_free=60.7, wall=6991 epoch 003: 1703 / 1707 loss=5.161, nll_loss=3.763, ppl=13.57, wps=319360, ups=0.65, wpb=488736, bsz=16336.4, num_updates=5100, lr=0.000885615, gnorm=0.575, clip=2, loss_scale=4, train_wall=134, gb_free=60.7, wall=6991 end of epoch 3 (average epoch stats below) epoch 003 | loss 5.449 | nll_loss 4.087 | ppl 17 | wps 359218 | ups 0.73 | wpb 489879 | bsz 16330.6 | num_updates 5104 | lr 0.000885268 | gnorm 0.605 | clip 1.2 | loss_scale 4 | train_wall 2276 | gb_free 61.6 | wall 6995 epoch 003 | loss 5.449 | nll_loss 4.087 | ppl 17 | wps 359218 | ups 0.73 | wpb 489879 | bsz 16330.6 | num_updates 5104 | lr 0.000885268 | gnorm 0.605 | clip 1.2 | loss_scale 4 | train_wall 2276 | gb_free 61.6 | wall 6995 epoch 003 | loss 5.449 | nll_loss 4.087 | ppl 17 | wps 359218 | ups 0.73 | wpb 489879 | bsz 16330.6 | num_updates 5104 | lr 0.000885268 | gnorm 0.605 | clip 1.2 | loss_scale 4 | train_wall 2276 | gb_free 61.6 | wall 6995 Start iterating over samples epoch 004: 96 / 1707 loss=4.986, nll_loss=3.563, ppl=11.81, wps=366306, ups=0.75, wpb=486058, bsz=16346.5, num_updates=5200, lr=0.000877058, gnorm=0.656, clip=9, loss_scale=4, train_wall=132, gb_free=60.9, wall=7123 epoch 004: 96 / 1707 loss=4.986, nll_loss=3.563, ppl=11.81, wps=366306, ups=0.75, wpb=486058, bsz=16346.5, num_updates=5200, lr=0.000877058, gnorm=0.656, clip=9, loss_scale=4, train_wall=132, gb_free=60.9, wall=7123 epoch 004: 96 / 1707 loss=4.986, nll_loss=3.563, ppl=11.81, wps=366306, ups=0.75, wpb=486058, bsz=16346.5, num_updates=5200, lr=0.000877058, gnorm=0.656, clip=9, loss_scale=4, train_wall=132, gb_free=60.9, wall=7123 epoch 004: 96 / 1707 loss=4.986, nll_loss=3.563, ppl=11.81, wps=366306, ups=0.75, wpb=486058, bsz=16346.5, num_updates=5200, lr=0.000877058, gnorm=0.656, clip=9, loss_scale=4, train_wall=132, gb_free=60.9, wall=7123 epoch 004: 196 / 1707 loss=4.64, nll_loss=3.176, ppl=9.04, wps=367041, ups=0.75, wpb=490708, bsz=16206, num_updates=5300, lr=0.000868744, gnorm=0.61, clip=7, loss_scale=4, train_wall=133, gb_free=61.1, wall=7257 epoch 004: 196 / 1707 loss=4.64, nll_loss=3.176, ppl=9.04, wps=367041, ups=0.75, wpb=490708, bsz=16206, num_updates=5300, lr=0.000868744, gnorm=0.61, clip=7, loss_scale=4, train_wall=133, gb_free=61.1, wall=7257 epoch 004: 196 / 1707 loss=4.64, nll_loss=3.176, ppl=9.04, wps=367041, ups=0.75, wpb=490708, bsz=16206, num_updates=5300, lr=0.000868744, gnorm=0.61, clip=7, loss_scale=4, train_wall=133, gb_free=61.1, wall=7257 epoch 004: 196 / 1707 loss=4.64, nll_loss=3.176, ppl=9.04, wps=367041, ups=0.75, wpb=490708, bsz=16206, num_updates=5300, lr=0.000868744, gnorm=0.61, clip=7, loss_scale=4, train_wall=133, gb_free=61.1, wall=7257 epoch 004: 297 / 1707 loss=4.472, nll_loss=2.991, ppl=7.95, wps=364441, ups=0.74, wpb=490676, bsz=16132.6, num_updates=5400, lr=0.000860663, gnorm=0.529, clip=2, loss_scale=4, train_wall=134, gb_free=60.5, wall=7392 epoch 004: 297 / 1707 loss=4.472, nll_loss=2.991, ppl=7.95, wps=364441, ups=0.74, wpb=490676, bsz=16132.6, num_updates=5400, lr=0.000860663, gnorm=0.529, clip=2, loss_scale=4, train_wall=134, gb_free=60.5, wall=7392 epoch 004: 297 / 1707 loss=4.472, nll_loss=2.991, ppl=7.95, wps=364441, ups=0.74, wpb=490676, bsz=16132.6, num_updates=5400, lr=0.000860663, gnorm=0.529, clip=2, loss_scale=4, train_wall=134, gb_free=60.5, wall=7392 epoch 004: 297 / 1707 loss=4.472, nll_loss=2.991, ppl=7.95, wps=364441, ups=0.74, wpb=490676, bsz=16132.6, num_updates=5400, lr=0.000860663, gnorm=0.529, clip=2, loss_scale=4, train_wall=134, gb_free=60.5, wall=7392 epoch 004: 397 / 1707 loss=4.373, nll_loss=2.883, ppl=7.38, wps=366118, ups=0.75, wpb=489901, bsz=16531.2, num_updates=5500, lr=0.000852803, gnorm=0.494, clip=2, loss_scale=4, train_wall=133, gb_free=60.4, wall=7525 epoch 004: 397 / 1707 loss=4.373, nll_loss=2.883, ppl=7.38, wps=366118, ups=0.75, wpb=489901, bsz=16531.2, num_updates=5500, lr=0.000852803, gnorm=0.494, clip=2, loss_scale=4, train_wall=133, gb_free=60.4, wall=7525 epoch 004: 397 / 1707 loss=4.373, nll_loss=2.883, ppl=7.38, wps=366118, ups=0.75, wpb=489901, bsz=16531.2, num_updates=5500, lr=0.000852803, gnorm=0.494, clip=2, loss_scale=4, train_wall=133, gb_free=60.4, wall=7525 epoch 004: 397 / 1707 loss=4.373, nll_loss=2.883, ppl=7.38, wps=366118, ups=0.75, wpb=489901, bsz=16531.2, num_updates=5500, lr=0.000852803, gnorm=0.494, clip=2, loss_scale=4, train_wall=133, gb_free=60.4, wall=7525 epoch 004: 497 / 1707 loss=4.329, nll_loss=2.838, ppl=7.15, wps=366191, ups=0.75, wpb=489472, bsz=16203, num_updates=5600, lr=0.000845154, gnorm=0.46, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=7659 epoch 004: 497 / 1707 loss=4.329, nll_loss=2.838, ppl=7.15, wps=366191, ups=0.75, wpb=489472, bsz=16203, num_updates=5600, lr=0.000845154, gnorm=0.46, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=7659 epoch 004: 497 / 1707 loss=4.329, nll_loss=2.838, ppl=7.15, wps=366191, ups=0.75, wpb=489472, bsz=16203, num_updates=5600, lr=0.000845154, gnorm=0.46, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=7659 epoch 004: 497 / 1707 loss=4.329, nll_loss=2.838, ppl=7.15, wps=366191, ups=0.75, wpb=489472, bsz=16203, num_updates=5600, lr=0.000845154, gnorm=0.46, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=7659 epoch 004: 597 / 1707 loss=4.277, nll_loss=2.781, ppl=6.87, wps=365687, ups=0.75, wpb=489486, bsz=16299.5, num_updates=5700, lr=0.000837708, gnorm=0.433, clip=0, loss_scale=8, train_wall=133, gb_free=60.8, wall=7793 epoch 004: 597 / 1707 loss=4.277, nll_loss=2.781, ppl=6.87, wps=365687, ups=0.75, wpb=489486, bsz=16299.5, num_updates=5700, lr=0.000837708, gnorm=0.433, clip=0, loss_scale=8, train_wall=133, gb_free=60.8, wall=7793 epoch 004: 597 / 1707 loss=4.277, nll_loss=2.781, ppl=6.87, wps=365687, ups=0.75, wpb=489486, bsz=16299.5, num_updates=5700, lr=0.000837708, gnorm=0.433, clip=0, loss_scale=8, train_wall=133, gb_free=60.8, wall=7793 epoch 004: 597 / 1707 loss=4.277, nll_loss=2.781, ppl=6.87, wps=365687, ups=0.75, wpb=489486, bsz=16299.5, num_updates=5700, lr=0.000837708, gnorm=0.433, clip=0, loss_scale=8, train_wall=133, gb_free=60.8, wall=7793 epoch 004: 698 / 1707 loss=4.237, nll_loss=2.738, ppl=6.67, wps=361063, ups=0.74, wpb=489370, bsz=16376.1, num_updates=5800, lr=0.000830455, gnorm=0.424, clip=1, loss_scale=4, train_wall=135, gb_free=60.7, wall=7929 epoch 004: 698 / 1707 loss=4.237, nll_loss=2.738, ppl=6.67, wps=361063, ups=0.74, wpb=489370, bsz=16376.1, num_updates=5800, lr=0.000830455, gnorm=0.424, clip=1, loss_scale=4, train_wall=135, gb_free=60.7, wall=7929 epoch 004: 698 / 1707 loss=4.237, nll_loss=2.738, ppl=6.67, wps=361063, ups=0.74, wpb=489370, bsz=16376.1, num_updates=5800, lr=0.000830455, gnorm=0.424, clip=1, loss_scale=4, train_wall=135, gb_free=60.7, wall=7929 epoch 004: 698 / 1707 loss=4.237, nll_loss=2.738, ppl=6.67, wps=361063, ups=0.74, wpb=489370, bsz=16376.1, num_updates=5800, lr=0.000830455, gnorm=0.424, clip=1, loss_scale=4, train_wall=135, gb_free=60.7, wall=7929 epoch 004: 798 / 1707 loss=4.202, nll_loss=2.701, ppl=6.5, wps=365488, ups=0.75, wpb=489523, bsz=16408.4, num_updates=5900, lr=0.000823387, gnorm=0.41, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=8062 epoch 004: 798 / 1707 loss=4.202, nll_loss=2.701, ppl=6.5, wps=365488, ups=0.75, wpb=489523, bsz=16408.4, num_updates=5900, lr=0.000823387, gnorm=0.41, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=8062 epoch 004: 798 / 1707 loss=4.202, nll_loss=2.701, ppl=6.5, wps=365488, ups=0.75, wpb=489523, bsz=16408.4, num_updates=5900, lr=0.000823387, gnorm=0.41, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=8062 epoch 004: 798 / 1707 loss=4.202, nll_loss=2.701, ppl=6.5, wps=365488, ups=0.75, wpb=489523, bsz=16408.4, num_updates=5900, lr=0.000823387, gnorm=0.41, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=8062 epoch 004: 898 / 1707 loss=4.176, nll_loss=2.673, ppl=6.38, wps=367256, ups=0.75, wpb=491271, bsz=16605.7, num_updates=6000, lr=0.000816497, gnorm=0.408, clip=0, loss_scale=8, train_wall=133, gb_free=60.9, wall=8196 epoch 004: 898 / 1707 loss=4.176, nll_loss=2.673, ppl=6.38, wps=367256, ups=0.75, wpb=491271, bsz=16605.7, num_updates=6000, lr=0.000816497, gnorm=0.408, clip=0, loss_scale=8, train_wall=133, gb_free=60.9, wall=8196 epoch 004: 898 / 1707 loss=4.176, nll_loss=2.673, ppl=6.38, wps=367256, ups=0.75, wpb=491271, bsz=16605.7, num_updates=6000, lr=0.000816497, gnorm=0.408, clip=0, loss_scale=8, train_wall=133, gb_free=60.9, wall=8196 epoch 004: 898 / 1707 loss=4.176, nll_loss=2.673, ppl=6.38, wps=367256, ups=0.75, wpb=491271, bsz=16605.7, num_updates=6000, lr=0.000816497, gnorm=0.408, clip=0, loss_scale=8, train_wall=133, gb_free=60.9, wall=8196 begin validation on "valid" subset epoch 004 | valid on 'valid' subset | loss 4.275 | nll_loss 2.756 | ppl 6.76 | wps 222483 | wpb 22263 | bsz 1004 | num_updates 6000 | best_loss 4.275 epoch 004 | valid on 'valid' subset | loss 4.275 | nll_loss 2.756 | ppl 6.76 | wps 222483 | wpb 22263 | bsz 1004 | num_updates 6000 | best_loss 4.275 epoch 004 | valid on 'valid' subset | loss 4.275 | nll_loss 2.756 | ppl 6.76 | wps 222483 | wpb 22263 | bsz 1004 | num_updates 6000 | best_loss 4.275 epoch 004 | valid on 'valid' subset | loss 4.275 | nll_loss 2.756 | ppl 6.76 | wps 222483 | wpb 22263 | bsz 1004 | num_updates 6000 | best_loss 4.275 epoch 004: 999 / 1707 loss=4.15, nll_loss=2.645, ppl=6.26, wps=313143, ups=0.64, wpb=490423, bsz=16230.4, num_updates=6100, lr=0.000809776, gnorm=0.408, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=8353 epoch 004: 999 / 1707 loss=4.15, nll_loss=2.645, ppl=6.26, wps=313143, ups=0.64, wpb=490423, bsz=16230.4, num_updates=6100, lr=0.000809776, gnorm=0.408, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=8353 epoch 004: 999 / 1707 loss=4.15, nll_loss=2.645, ppl=6.26, wps=313143, ups=0.64, wpb=490423, bsz=16230.4, num_updates=6100, lr=0.000809776, gnorm=0.408, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=8353 epoch 004: 999 / 1707 loss=4.15, nll_loss=2.645, ppl=6.26, wps=313143, ups=0.64, wpb=490423, bsz=16230.4, num_updates=6100, lr=0.000809776, gnorm=0.408, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=8353 epoch 004: 1099 / 1707 loss=4.13, nll_loss=2.624, ppl=6.17, wps=365948, ups=0.75, wpb=490461, bsz=16558.7, num_updates=6200, lr=0.000803219, gnorm=0.377, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=8487 epoch 004: 1099 / 1707 loss=4.13, nll_loss=2.624, ppl=6.17, wps=365948, ups=0.75, wpb=490461, bsz=16558.7, num_updates=6200, lr=0.000803219, gnorm=0.377, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=8487 epoch 004: 1099 / 1707 loss=4.13, nll_loss=2.624, ppl=6.17, wps=365948, ups=0.75, wpb=490461, bsz=16558.7, num_updates=6200, lr=0.000803219, gnorm=0.377, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=8487 epoch 004: 1099 / 1707 loss=4.13, nll_loss=2.624, ppl=6.17, wps=365948, ups=0.75, wpb=490461, bsz=16558.7, num_updates=6200, lr=0.000803219, gnorm=0.377, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=8487 epoch 004: 1199 / 1707 loss=4.117, nll_loss=2.61, ppl=6.11, wps=365467, ups=0.75, wpb=489826, bsz=16222.8, num_updates=6300, lr=0.000796819, gnorm=0.384, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=8621 epoch 004: 1199 / 1707 loss=4.117, nll_loss=2.61, ppl=6.11, wps=365467, ups=0.75, wpb=489826, bsz=16222.8, num_updates=6300, lr=0.000796819, gnorm=0.384, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=8621 epoch 004: 1199 / 1707 loss=4.117, nll_loss=2.61, ppl=6.11, wps=365467, ups=0.75, wpb=489826, bsz=16222.8, num_updates=6300, lr=0.000796819, gnorm=0.384, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=8621 epoch 004: 1199 / 1707 loss=4.117, nll_loss=2.61, ppl=6.11, wps=365467, ups=0.75, wpb=489826, bsz=16222.8, num_updates=6300, lr=0.000796819, gnorm=0.384, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=8621 epoch 004: 1299 / 1707 loss=4.104, nll_loss=2.597, ppl=6.05, wps=365422, ups=0.75, wpb=489922, bsz=16160.1, num_updates=6400, lr=0.000790569, gnorm=0.375, clip=0, loss_scale=8, train_wall=134, gb_free=60.5, wall=8755 epoch 004: 1299 / 1707 loss=4.104, nll_loss=2.597, ppl=6.05, wps=365422, ups=0.75, wpb=489922, bsz=16160.1, num_updates=6400, lr=0.000790569, gnorm=0.375, clip=0, loss_scale=8, train_wall=134, gb_free=60.5, wall=8755 epoch 004: 1299 / 1707 loss=4.104, nll_loss=2.597, ppl=6.05, wps=365422, ups=0.75, wpb=489922, bsz=16160.1, num_updates=6400, lr=0.000790569, gnorm=0.375, clip=0, loss_scale=8, train_wall=134, gb_free=60.5, wall=8755 epoch 004: 1299 / 1707 loss=4.104, nll_loss=2.597, ppl=6.05, wps=365422, ups=0.75, wpb=489922, bsz=16160.1, num_updates=6400, lr=0.000790569, gnorm=0.375, clip=0, loss_scale=8, train_wall=134, gb_free=60.5, wall=8755 epoch 004: 1399 / 1707 loss=4.082, nll_loss=2.573, ppl=5.95, wps=366430, ups=0.75, wpb=491110, bsz=16351.7, num_updates=6500, lr=0.000784465, gnorm=0.374, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=8889 epoch 004: 1399 / 1707 loss=4.082, nll_loss=2.573, ppl=5.95, wps=366430, ups=0.75, wpb=491110, bsz=16351.7, num_updates=6500, lr=0.000784465, gnorm=0.374, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=8889 epoch 004: 1399 / 1707 loss=4.082, nll_loss=2.573, ppl=5.95, wps=366430, ups=0.75, wpb=491110, bsz=16351.7, num_updates=6500, lr=0.000784465, gnorm=0.374, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=8889 epoch 004: 1399 / 1707 loss=4.082, nll_loss=2.573, ppl=5.95, wps=366430, ups=0.75, wpb=491110, bsz=16351.7, num_updates=6500, lr=0.000784465, gnorm=0.374, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=8889 epoch 004: 1500 / 1707 loss=4.071, nll_loss=2.561, ppl=5.9, wps=359898, ups=0.74, wpb=489154, bsz=16357.4, num_updates=6600, lr=0.000778499, gnorm=0.369, clip=0, loss_scale=8, train_wall=135, gb_free=60.2, wall=9025 epoch 004: 1500 / 1707 loss=4.071, nll_loss=2.561, ppl=5.9, wps=359898, ups=0.74, wpb=489154, bsz=16357.4, num_updates=6600, lr=0.000778499, gnorm=0.369, clip=0, loss_scale=8, train_wall=135, gb_free=60.2, wall=9025 epoch 004: 1500 / 1707 loss=4.071, nll_loss=2.561, ppl=5.9, wps=359898, ups=0.74, wpb=489154, bsz=16357.4, num_updates=6600, lr=0.000778499, gnorm=0.369, clip=0, loss_scale=8, train_wall=135, gb_free=60.2, wall=9025 epoch 004: 1500 / 1707 loss=4.071, nll_loss=2.561, ppl=5.9, wps=359898, ups=0.74, wpb=489154, bsz=16357.4, num_updates=6600, lr=0.000778499, gnorm=0.369, clip=0, loss_scale=8, train_wall=135, gb_free=60.2, wall=9025 epoch 004: 1600 / 1707 loss=4.054, nll_loss=2.543, ppl=5.83, wps=365291, ups=0.74, wpb=490359, bsz=16578.4, num_updates=6700, lr=0.000772667, gnorm=0.347, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=9159 epoch 004: 1600 / 1707 loss=4.054, nll_loss=2.543, ppl=5.83, wps=365291, ups=0.74, wpb=490359, bsz=16578.4, num_updates=6700, lr=0.000772667, gnorm=0.347, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=9159 epoch 004: 1600 / 1707 loss=4.054, nll_loss=2.543, ppl=5.83, wps=365291, ups=0.74, wpb=490359, bsz=16578.4, num_updates=6700, lr=0.000772667, gnorm=0.347, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=9159 epoch 004: 1600 / 1707 loss=4.054, nll_loss=2.543, ppl=5.83, wps=365291, ups=0.74, wpb=490359, bsz=16578.4, num_updates=6700, lr=0.000772667, gnorm=0.347, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=9159 epoch 004: 1700 / 1707 loss=4.043, nll_loss=2.531, ppl=5.78, wps=364992, ups=0.74, wpb=490128, bsz=16165.8, num_updates=6800, lr=0.000766965, gnorm=0.361, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=9293 epoch 004: 1700 / 1707 loss=4.043, nll_loss=2.531, ppl=5.78, wps=364992, ups=0.74, wpb=490128, bsz=16165.8, num_updates=6800, lr=0.000766965, gnorm=0.361, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=9293 epoch 004: 1700 / 1707 loss=4.043, nll_loss=2.531, ppl=5.78, wps=364992, ups=0.74, wpb=490128, bsz=16165.8, num_updates=6800, lr=0.000766965, gnorm=0.361, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=9293 epoch 004: 1700 / 1707 loss=4.043, nll_loss=2.531, ppl=5.78, wps=364992, ups=0.74, wpb=490128, bsz=16165.8, num_updates=6800, lr=0.000766965, gnorm=0.361, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=9293 end of epoch 4 (average epoch stats below) epoch 004 | loss 4.259 | nll_loss 2.763 | ppl 6.79 | wps 361621 | ups 0.74 | wpb 489880 | bsz 16334 | num_updates 6807 | lr 0.000766571 | gnorm 0.436 | clip 1.2 | loss_scale 8 | train_wall 2277 | gb_free 60.8 | wall 9302 epoch 004 | loss 4.259 | nll_loss 2.763 | ppl 6.79 | wps 361621 | ups 0.74 | wpb 489880 | bsz 16334 | num_updates 6807 | lr 0.000766571 | gnorm 0.436 | clip 1.2 | loss_scale 8 | train_wall 2277 | gb_free 60.8 | wall 9302 epoch 004 | loss 4.259 | nll_loss 2.763 | ppl 6.79 | wps 361621 | ups 0.74 | wpb 489880 | bsz 16334 | num_updates 6807 | lr 0.000766571 | gnorm 0.436 | clip 1.2 | loss_scale 8 | train_wall 2277 | gb_free 60.8 | wall 9302 epoch 004 | loss 4.259 | nll_loss 2.763 | ppl 6.79 | wps 361621 | ups 0.74 | wpb 489880 | bsz 16334 | num_updates 6807 | lr 0.000766571 | gnorm 0.436 | clip 1.2 | loss_scale 8 | train_wall 2277 | gb_free 60.8 | wall 9302 Start iterating over samples epoch 005: 94 / 1707 loss=4.011, nll_loss=2.495, ppl=5.64, wps=359762, ups=0.74, wpb=486287, bsz=16207.8, num_updates=6900, lr=0.000761387, gnorm=0.348, clip=0, loss_scale=8, train_wall=134, gb_free=60.5, wall=9429 epoch 005: 94 / 1707 loss=4.011, nll_loss=2.495, ppl=5.64, wps=359762, ups=0.74, wpb=486287, bsz=16207.8, num_updates=6900, lr=0.000761387, gnorm=0.348, clip=0, loss_scale=8, train_wall=134, gb_free=60.5, wall=9429 epoch 005: 94 / 1707 loss=4.011, nll_loss=2.495, ppl=5.64, wps=359762, ups=0.74, wpb=486287, bsz=16207.8, num_updates=6900, lr=0.000761387, gnorm=0.348, clip=0, loss_scale=8, train_wall=134, gb_free=60.5, wall=9429 epoch 005: 94 / 1707 loss=4.011, nll_loss=2.495, ppl=5.64, wps=359762, ups=0.74, wpb=486287, bsz=16207.8, num_updates=6900, lr=0.000761387, gnorm=0.348, clip=0, loss_scale=8, train_wall=134, gb_free=60.5, wall=9429 epoch 005: 94 / 1707 loss=4.011, nll_loss=2.495, ppl=5.64, wps=359762, ups=0.74, wpb=486287, bsz=16207.8, num_updates=6900, lr=0.000761387, gnorm=0.348, clip=0, loss_scale=8, train_wall=134, gb_free=60.5, wall=9429 epoch 005: 194 / 1707 loss=4.004, nll_loss=2.488, ppl=5.61, wps=366218, ups=0.75, wpb=490299, bsz=16329.5, num_updates=7000, lr=0.000755929, gnorm=0.351, clip=0, loss_scale=8, train_wall=133, gb_free=61.1, wall=9562 epoch 005: 194 / 1707 loss=4.004, nll_loss=2.488, ppl=5.61, wps=366218, ups=0.75, wpb=490299, bsz=16329.5, num_updates=7000, lr=0.000755929, gnorm=0.351, clip=0, loss_scale=8, train_wall=133, gb_free=61.1, wall=9562 epoch 005: 194 / 1707 loss=4.004, nll_loss=2.488, ppl=5.61, wps=366218, ups=0.75, wpb=490299, bsz=16329.5, num_updates=7000, lr=0.000755929, gnorm=0.351, clip=0, loss_scale=8, train_wall=133, gb_free=61.1, wall=9562 epoch 005: 194 / 1707 loss=4.004, nll_loss=2.488, ppl=5.61, wps=366218, ups=0.75, wpb=490299, bsz=16329.5, num_updates=7000, lr=0.000755929, gnorm=0.351, clip=0, loss_scale=8, train_wall=133, gb_free=61.1, wall=9562 epoch 005: 194 / 1707 loss=4.004, nll_loss=2.488, ppl=5.61, wps=366218, ups=0.75, wpb=490299, bsz=16329.5, num_updates=7000, lr=0.000755929, gnorm=0.351, clip=0, loss_scale=8, train_wall=133, gb_free=61.1, wall=9562 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 4.097 | nll_loss 2.568 | ppl 5.93 | wps 184318 | wpb 22263 | bsz 1004 | num_updates 7000 | best_loss 4.097 epoch 005 | valid on 'valid' subset | loss 4.097 | nll_loss 2.568 | ppl 5.93 | wps 184318 | wpb 22263 | bsz 1004 | num_updates 7000 | best_loss 4.097 epoch 005 | valid on 'valid' subset | loss 4.097 | nll_loss 2.568 | ppl 5.93 | wps 184318 | wpb 22263 | bsz 1004 | num_updates 7000 | best_loss 4.097 epoch 005 | valid on 'valid' subset | loss 4.097 | nll_loss 2.568 | ppl 5.93 | wps 184318 | wpb 22263 | bsz 1004 | num_updates 7000 | best_loss 4.097 epoch 005 | valid on 'valid' subset | loss 4.097 | nll_loss 2.568 | ppl 5.93 | wps 184318 | wpb 22263 | bsz 1004 | num_updates 7000 | best_loss 4.097 epoch 005: 294 / 1707 loss=3.999, nll_loss=2.483, ppl=5.59, wps=324006, ups=0.66, wpb=489550, bsz=16334.6, num_updates=7100, lr=0.000750587, gnorm=0.343, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=9714 epoch 005: 294 / 1707 loss=3.999, nll_loss=2.483, ppl=5.59, wps=324006, ups=0.66, wpb=489550, bsz=16334.6, num_updates=7100, lr=0.000750587, gnorm=0.343, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=9714 epoch 005: 294 / 1707 loss=3.999, nll_loss=2.483, ppl=5.59, wps=324006, ups=0.66, wpb=489550, bsz=16334.6, num_updates=7100, lr=0.000750587, gnorm=0.343, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=9714 epoch 005: 294 / 1707 loss=3.999, nll_loss=2.483, ppl=5.59, wps=324006, ups=0.66, wpb=489550, bsz=16334.6, num_updates=7100, lr=0.000750587, gnorm=0.343, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=9714 epoch 005: 294 / 1707 loss=3.999, nll_loss=2.483, ppl=5.59, wps=324006, ups=0.66, wpb=489550, bsz=16334.6, num_updates=7100, lr=0.000750587, gnorm=0.343, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=9714 epoch 005: 395 / 1707 loss=3.997, nll_loss=2.481, ppl=5.58, wps=363914, ups=0.74, wpb=489479, bsz=16231.9, num_updates=7200, lr=0.000745356, gnorm=0.345, clip=0, loss_scale=8, train_wall=134, gb_free=60.5, wall=9848 epoch 005: 395 / 1707 loss=3.997, nll_loss=2.481, ppl=5.58, wps=363914, ups=0.74, wpb=489479, bsz=16231.9, num_updates=7200, lr=0.000745356, gnorm=0.345, clip=0, loss_scale=8, train_wall=134, gb_free=60.5, wall=9848 epoch 005: 395 / 1707 loss=3.997, nll_loss=2.481, ppl=5.58, wps=363914, ups=0.74, wpb=489479, bsz=16231.9, num_updates=7200, lr=0.000745356, gnorm=0.345, clip=0, loss_scale=8, train_wall=134, gb_free=60.5, wall=9848 epoch 005: 395 / 1707 loss=3.997, nll_loss=2.481, ppl=5.58, wps=363914, ups=0.74, wpb=489479, bsz=16231.9, num_updates=7200, lr=0.000745356, gnorm=0.345, clip=0, loss_scale=8, train_wall=134, gb_free=60.5, wall=9848 epoch 005: 395 / 1707 loss=3.997, nll_loss=2.481, ppl=5.58, wps=363914, ups=0.74, wpb=489479, bsz=16231.9, num_updates=7200, lr=0.000745356, gnorm=0.345, clip=0, loss_scale=8, train_wall=134, gb_free=60.5, wall=9848 epoch 005: 495 / 1707 loss=3.984, nll_loss=2.467, ppl=5.53, wps=367317, ups=0.75, wpb=490906, bsz=16486.9, num_updates=7300, lr=0.000740233, gnorm=0.342, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=9982 epoch 005: 495 / 1707 loss=3.984, nll_loss=2.467, ppl=5.53, wps=367317, ups=0.75, wpb=490906, bsz=16486.9, num_updates=7300, lr=0.000740233, gnorm=0.342, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=9982 epoch 005: 495 / 1707 loss=3.984, nll_loss=2.467, ppl=5.53, wps=367317, ups=0.75, wpb=490906, bsz=16486.9, num_updates=7300, lr=0.000740233, gnorm=0.342, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=9982 epoch 005: 495 / 1707 loss=3.984, nll_loss=2.467, ppl=5.53, wps=367317, ups=0.75, wpb=490906, bsz=16486.9, num_updates=7300, lr=0.000740233, gnorm=0.342, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=9982 epoch 005: 495 / 1707 loss=3.984, nll_loss=2.467, ppl=5.53, wps=367317, ups=0.75, wpb=490906, bsz=16486.9, num_updates=7300, lr=0.000740233, gnorm=0.342, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=9982 epoch 005: 596 / 1707 loss=3.976, nll_loss=2.458, ppl=5.5, wps=361857, ups=0.74, wpb=489801, bsz=16207.4, num_updates=7400, lr=0.000735215, gnorm=0.346, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=10117 epoch 005: 596 / 1707 loss=3.976, nll_loss=2.458, ppl=5.5, wps=361857, ups=0.74, wpb=489801, bsz=16207.4, num_updates=7400, lr=0.000735215, gnorm=0.346, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=10117 epoch 005: 596 / 1707 loss=3.976, nll_loss=2.458, ppl=5.5, wps=361857, ups=0.74, wpb=489801, bsz=16207.4, num_updates=7400, lr=0.000735215, gnorm=0.346, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=10117 epoch 005: 596 / 1707 loss=3.976, nll_loss=2.458, ppl=5.5, wps=361857, ups=0.74, wpb=489801, bsz=16207.4, num_updates=7400, lr=0.000735215, gnorm=0.346, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=10117 epoch 005: 596 / 1707 loss=3.976, nll_loss=2.458, ppl=5.5, wps=361857, ups=0.74, wpb=489801, bsz=16207.4, num_updates=7400, lr=0.000735215, gnorm=0.346, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=10117 epoch 005: 696 / 1707 loss=3.972, nll_loss=2.454, ppl=5.48, wps=366282, ups=0.75, wpb=490458, bsz=16266, num_updates=7500, lr=0.000730297, gnorm=0.335, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=10251 epoch 005: 696 / 1707 loss=3.972, nll_loss=2.454, ppl=5.48, wps=366282, ups=0.75, wpb=490458, bsz=16266, num_updates=7500, lr=0.000730297, gnorm=0.335, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=10251 epoch 005: 696 / 1707 loss=3.972, nll_loss=2.454, ppl=5.48, wps=366282, ups=0.75, wpb=490458, bsz=16266, num_updates=7500, lr=0.000730297, gnorm=0.335, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=10251 epoch 005: 696 / 1707 loss=3.972, nll_loss=2.454, ppl=5.48, wps=366282, ups=0.75, wpb=490458, bsz=16266, num_updates=7500, lr=0.000730297, gnorm=0.335, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=10251 epoch 005: 696 / 1707 loss=3.972, nll_loss=2.454, ppl=5.48, wps=366282, ups=0.75, wpb=490458, bsz=16266, num_updates=7500, lr=0.000730297, gnorm=0.335, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=10251 epoch 005: 796 / 1707 loss=3.965, nll_loss=2.447, ppl=5.45, wps=365446, ups=0.75, wpb=490130, bsz=16562.8, num_updates=7600, lr=0.000725476, gnorm=0.332, clip=0, loss_scale=8, train_wall=134, gb_free=60.6, wall=10385 epoch 005: 796 / 1707 loss=3.965, nll_loss=2.447, ppl=5.45, wps=365446, ups=0.75, wpb=490130, bsz=16562.8, num_updates=7600, lr=0.000725476, gnorm=0.332, clip=0, loss_scale=8, train_wall=134, gb_free=60.6, wall=10385 epoch 005: 796 / 1707 loss=3.965, nll_loss=2.447, ppl=5.45, wps=365446, ups=0.75, wpb=490130, bsz=16562.8, num_updates=7600, lr=0.000725476, gnorm=0.332, clip=0, loss_scale=8, train_wall=134, gb_free=60.6, wall=10385 epoch 005: 796 / 1707 loss=3.965, nll_loss=2.447, ppl=5.45, wps=365446, ups=0.75, wpb=490130, bsz=16562.8, num_updates=7600, lr=0.000725476, gnorm=0.332, clip=0, loss_scale=8, train_wall=134, gb_free=60.6, wall=10385 epoch 005: 796 / 1707 loss=3.965, nll_loss=2.447, ppl=5.45, wps=365446, ups=0.75, wpb=490130, bsz=16562.8, num_updates=7600, lr=0.000725476, gnorm=0.332, clip=0, loss_scale=8, train_wall=134, gb_free=60.6, wall=10385 epoch 005: 897 / 1707 loss=3.959, nll_loss=2.441, ppl=5.43, wps=363808, ups=0.74, wpb=491536, bsz=16471.8, num_updates=7700, lr=0.00072075, gnorm=0.336, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=10520 epoch 005: 897 / 1707 loss=3.959, nll_loss=2.441, ppl=5.43, wps=363808, ups=0.74, wpb=491536, bsz=16471.8, num_updates=7700, lr=0.00072075, gnorm=0.336, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=10520 epoch 005: 897 / 1707 loss=3.959, nll_loss=2.441, ppl=5.43, wps=363808, ups=0.74, wpb=491536, bsz=16471.8, num_updates=7700, lr=0.00072075, gnorm=0.336, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=10520 epoch 005: 897 / 1707 loss=3.959, nll_loss=2.441, ppl=5.43, wps=363808, ups=0.74, wpb=491536, bsz=16471.8, num_updates=7700, lr=0.00072075, gnorm=0.336, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=10520 epoch 005: 897 / 1707 loss=3.959, nll_loss=2.441, ppl=5.43, wps=363808, ups=0.74, wpb=491536, bsz=16471.8, num_updates=7700, lr=0.00072075, gnorm=0.336, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=10520 epoch 005: 997 / 1707 loss=3.958, nll_loss=2.44, ppl=5.43, wps=366448, ups=0.75, wpb=489862, bsz=16127.9, num_updates=7800, lr=0.000716115, gnorm=0.331, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=10654 epoch 005: 997 / 1707 loss=3.958, nll_loss=2.44, ppl=5.43, wps=366448, ups=0.75, wpb=489862, bsz=16127.9, num_updates=7800, lr=0.000716115, gnorm=0.331, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=10654 epoch 005: 997 / 1707 loss=3.958, nll_loss=2.44, ppl=5.43, wps=366448, ups=0.75, wpb=489862, bsz=16127.9, num_updates=7800, lr=0.000716115, gnorm=0.331, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=10654 epoch 005: 997 / 1707 loss=3.958, nll_loss=2.44, ppl=5.43, wps=366448, ups=0.75, wpb=489862, bsz=16127.9, num_updates=7800, lr=0.000716115, gnorm=0.331, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=10654 epoch 005: 997 / 1707 loss=3.958, nll_loss=2.44, ppl=5.43, wps=366448, ups=0.75, wpb=489862, bsz=16127.9, num_updates=7800, lr=0.000716115, gnorm=0.331, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=10654 epoch 005: 1097 / 1707 loss=3.949, nll_loss=2.431, ppl=5.39, wps=365968, ups=0.75, wpb=490132, bsz=16039.8, num_updates=7900, lr=0.000711568, gnorm=0.326, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=10788 epoch 005: 1097 / 1707 loss=3.949, nll_loss=2.431, ppl=5.39, wps=365968, ups=0.75, wpb=490132, bsz=16039.8, num_updates=7900, lr=0.000711568, gnorm=0.326, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=10788 epoch 005: 1097 / 1707 loss=3.949, nll_loss=2.431, ppl=5.39, wps=365968, ups=0.75, wpb=490132, bsz=16039.8, num_updates=7900, lr=0.000711568, gnorm=0.326, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=10788 epoch 005: 1097 / 1707 loss=3.949, nll_loss=2.431, ppl=5.39, wps=365968, ups=0.75, wpb=490132, bsz=16039.8, num_updates=7900, lr=0.000711568, gnorm=0.326, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=10788 epoch 005: 1097 / 1707 loss=3.949, nll_loss=2.431, ppl=5.39, wps=365968, ups=0.75, wpb=490132, bsz=16039.8, num_updates=7900, lr=0.000711568, gnorm=0.326, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=10788 epoch 005: 1198 / 1707 loss=3.947, nll_loss=2.429, ppl=5.38, wps=360197, ups=0.74, wpb=489135, bsz=16418.1, num_updates=8000, lr=0.000707107, gnorm=0.342, clip=0, loss_scale=8, train_wall=135, gb_free=60.8, wall=10924 epoch 005: 1198 / 1707 loss=3.947, nll_loss=2.429, ppl=5.38, wps=360197, ups=0.74, wpb=489135, bsz=16418.1, num_updates=8000, lr=0.000707107, gnorm=0.342, clip=0, loss_scale=8, train_wall=135, gb_free=60.8, wall=10924 epoch 005: 1198 / 1707 loss=3.947, nll_loss=2.429, ppl=5.38, wps=360197, ups=0.74, wpb=489135, bsz=16418.1, num_updates=8000, lr=0.000707107, gnorm=0.342, clip=0, loss_scale=8, train_wall=135, gb_free=60.8, wall=10924 epoch 005: 1198 / 1707 loss=3.947, nll_loss=2.429, ppl=5.38, wps=360197, ups=0.74, wpb=489135, bsz=16418.1, num_updates=8000, lr=0.000707107, gnorm=0.342, clip=0, loss_scale=8, train_wall=135, gb_free=60.8, wall=10924 epoch 005: 1198 / 1707 loss=3.947, nll_loss=2.429, ppl=5.38, wps=360197, ups=0.74, wpb=489135, bsz=16418.1, num_updates=8000, lr=0.000707107, gnorm=0.342, clip=0, loss_scale=8, train_wall=135, gb_free=60.8, wall=10924 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 4.042 | nll_loss 2.514 | ppl 5.71 | wps 219986 | wpb 22263 | bsz 1004 | num_updates 8000 | best_loss 4.042 epoch 005 | valid on 'valid' subset | loss 4.042 | nll_loss 2.514 | ppl 5.71 | wps 219986 | wpb 22263 | bsz 1004 | num_updates 8000 | best_loss 4.042 epoch 005 | valid on 'valid' subset | loss 4.042 | nll_loss 2.514 | ppl 5.71 | wps 219986 | wpb 22263 | bsz 1004 | num_updates 8000 | best_loss 4.042 epoch 005 | valid on 'valid' subset | loss 4.042 | nll_loss 2.514 | ppl 5.71 | wps 219986 | wpb 22263 | bsz 1004 | num_updates 8000 | best_loss 4.042 epoch 005 | valid on 'valid' subset | loss 4.042 | nll_loss 2.514 | ppl 5.71 | wps 219986 | wpb 22263 | bsz 1004 | num_updates 8000 | best_loss 4.042 epoch 005: 1298 / 1707 loss=3.942, nll_loss=2.424, ppl=5.37, wps=327438, ups=0.67, wpb=490342, bsz=16395.9, num_updates=8100, lr=0.000702728, gnorm=0.333, clip=0, loss_scale=8, train_wall=133, gb_free=60.7, wall=11073 epoch 005: 1298 / 1707 loss=3.942, nll_loss=2.424, ppl=5.37, wps=327438, ups=0.67, wpb=490342, bsz=16395.9, num_updates=8100, lr=0.000702728, gnorm=0.333, clip=0, loss_scale=8, train_wall=133, gb_free=60.7, wall=11073 epoch 005: 1298 / 1707 loss=3.942, nll_loss=2.424, ppl=5.37, wps=327438, ups=0.67, wpb=490342, bsz=16395.9, num_updates=8100, lr=0.000702728, gnorm=0.333, clip=0, loss_scale=8, train_wall=133, gb_free=60.7, wall=11073 epoch 005: 1298 / 1707 loss=3.942, nll_loss=2.424, ppl=5.37, wps=327438, ups=0.67, wpb=490342, bsz=16395.9, num_updates=8100, lr=0.000702728, gnorm=0.333, clip=0, loss_scale=8, train_wall=133, gb_free=60.7, wall=11073 epoch 005: 1298 / 1707 loss=3.942, nll_loss=2.424, ppl=5.37, wps=327438, ups=0.67, wpb=490342, bsz=16395.9, num_updates=8100, lr=0.000702728, gnorm=0.333, clip=0, loss_scale=8, train_wall=133, gb_free=60.7, wall=11073 epoch 005: 1399 / 1707 loss=3.935, nll_loss=2.416, ppl=5.34, wps=361720, ups=0.74, wpb=489618, bsz=16563.8, num_updates=8200, lr=0.00069843, gnorm=0.317, clip=0, loss_scale=8, train_wall=135, gb_free=60.6, wall=11209 epoch 005: 1399 / 1707 loss=3.935, nll_loss=2.416, ppl=5.34, wps=361720, ups=0.74, wpb=489618, bsz=16563.8, num_updates=8200, lr=0.00069843, gnorm=0.317, clip=0, loss_scale=8, train_wall=135, gb_free=60.6, wall=11209 epoch 005: 1399 / 1707 loss=3.935, nll_loss=2.416, ppl=5.34, wps=361720, ups=0.74, wpb=489618, bsz=16563.8, num_updates=8200, lr=0.00069843, gnorm=0.317, clip=0, loss_scale=8, train_wall=135, gb_free=60.6, wall=11209 epoch 005: 1399 / 1707 loss=3.935, nll_loss=2.416, ppl=5.34, wps=361720, ups=0.74, wpb=489618, bsz=16563.8, num_updates=8200, lr=0.00069843, gnorm=0.317, clip=0, loss_scale=8, train_wall=135, gb_free=60.6, wall=11209 epoch 005: 1399 / 1707 loss=3.935, nll_loss=2.416, ppl=5.34, wps=361720, ups=0.74, wpb=489618, bsz=16563.8, num_updates=8200, lr=0.00069843, gnorm=0.317, clip=0, loss_scale=8, train_wall=135, gb_free=60.6, wall=11209 epoch 005: 1499 / 1707 loss=3.93, nll_loss=2.411, ppl=5.32, wps=366112, ups=0.75, wpb=491042, bsz=16514, num_updates=8300, lr=0.00069421, gnorm=0.316, clip=0, loss_scale=8, train_wall=134, gb_free=60.7, wall=11343 epoch 005: 1499 / 1707 loss=3.93, nll_loss=2.411, ppl=5.32, wps=366112, ups=0.75, wpb=491042, bsz=16514, num_updates=8300, lr=0.00069421, gnorm=0.316, clip=0, loss_scale=8, train_wall=134, gb_free=60.7, wall=11343 epoch 005: 1499 / 1707 loss=3.93, nll_loss=2.411, ppl=5.32, wps=366112, ups=0.75, wpb=491042, bsz=16514, num_updates=8300, lr=0.00069421, gnorm=0.316, clip=0, loss_scale=8, train_wall=134, gb_free=60.7, wall=11343 epoch 005: 1499 / 1707 loss=3.93, nll_loss=2.411, ppl=5.32, wps=366112, ups=0.75, wpb=491042, bsz=16514, num_updates=8300, lr=0.00069421, gnorm=0.316, clip=0, loss_scale=8, train_wall=134, gb_free=60.7, wall=11343 epoch 005: 1499 / 1707 loss=3.93, nll_loss=2.411, ppl=5.32, wps=366112, ups=0.75, wpb=491042, bsz=16514, num_updates=8300, lr=0.00069421, gnorm=0.316, clip=0, loss_scale=8, train_wall=134, gb_free=60.7, wall=11343 epoch 005: 1599 / 1707 loss=3.927, nll_loss=2.407, ppl=5.3, wps=365623, ups=0.75, wpb=489802, bsz=16205, num_updates=8400, lr=0.000690066, gnorm=0.318, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=11477 epoch 005: 1599 / 1707 loss=3.927, nll_loss=2.407, ppl=5.3, wps=365623, ups=0.75, wpb=489802, bsz=16205, num_updates=8400, lr=0.000690066, gnorm=0.318, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=11477 epoch 005: 1599 / 1707 loss=3.927, nll_loss=2.407, ppl=5.3, wps=365623, ups=0.75, wpb=489802, bsz=16205, num_updates=8400, lr=0.000690066, gnorm=0.318, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=11477 epoch 005: 1599 / 1707 loss=3.927, nll_loss=2.407, ppl=5.3, wps=365623, ups=0.75, wpb=489802, bsz=16205, num_updates=8400, lr=0.000690066, gnorm=0.318, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=11477 epoch 005: 1599 / 1707 loss=3.927, nll_loss=2.407, ppl=5.3, wps=365623, ups=0.75, wpb=489802, bsz=16205, num_updates=8400, lr=0.000690066, gnorm=0.318, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=11477 epoch 005: 1700 / 1707 loss=3.923, nll_loss=2.404, ppl=5.29, wps=363054, ups=0.74, wpb=489786, bsz=16219.8, num_updates=8500, lr=0.000685994, gnorm=0.312, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=11612 epoch 005: 1700 / 1707 loss=3.923, nll_loss=2.404, ppl=5.29, wps=363054, ups=0.74, wpb=489786, bsz=16219.8, num_updates=8500, lr=0.000685994, gnorm=0.312, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=11612 epoch 005: 1700 / 1707 loss=3.923, nll_loss=2.404, ppl=5.29, wps=363054, ups=0.74, wpb=489786, bsz=16219.8, num_updates=8500, lr=0.000685994, gnorm=0.312, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=11612 epoch 005: 1700 / 1707 loss=3.923, nll_loss=2.404, ppl=5.29, wps=363054, ups=0.74, wpb=489786, bsz=16219.8, num_updates=8500, lr=0.000685994, gnorm=0.312, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=11612 epoch 005: 1700 / 1707 loss=3.923, nll_loss=2.404, ppl=5.29, wps=363054, ups=0.74, wpb=489786, bsz=16219.8, num_updates=8500, lr=0.000685994, gnorm=0.312, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=11612 end of epoch 5 (average epoch stats below) epoch 005 | loss 3.963 | nll_loss 2.445 | ppl 5.45 | wps 359270 | ups 0.73 | wpb 489902 | bsz 16332.1 | num_updates 8507 | lr 0.000685712 | gnorm 0.333 | clip 0 | loss_scale 8 | train_wall 2276 | gb_free 61.5 | wall 11620 epoch 005 | loss 3.963 | nll_loss 2.445 | ppl 5.45 | wps 359270 | ups 0.73 | wpb 489902 | bsz 16332.1 | num_updates 8507 | lr 0.000685712 | gnorm 0.333 | clip 0 | loss_scale 8 | train_wall 2276 | gb_free 61.5 | wall 11620 epoch 005 | loss 3.963 | nll_loss 2.445 | ppl 5.45 | wps 359270 | ups 0.73 | wpb 489902 | bsz 16332.1 | num_updates 8507 | lr 0.000685712 | gnorm 0.333 | clip 0 | loss_scale 8 | train_wall 2276 | gb_free 61.5 | wall 11620 epoch 005 | loss 3.963 | nll_loss 2.445 | ppl 5.45 | wps 359270 | ups 0.73 | wpb 489902 | bsz 16332.1 | num_updates 8507 | lr 0.000685712 | gnorm 0.333 | clip 0 | loss_scale 8 | train_wall 2276 | gb_free 61.5 | wall 11620 epoch 005 | loss 3.963 | nll_loss 2.445 | ppl 5.45 | wps 359270 | ups 0.73 | wpb 489902 | bsz 16332.1 | num_updates 8507 | lr 0.000685712 | gnorm 0.333 | clip 0 | loss_scale 8 | train_wall 2276 | gb_free 61.5 | wall 11620 Start iterating over samples epoch 006: 94 / 1707 loss=3.883, nll_loss=2.358, ppl=5.13, wps=362108, ups=0.74, wpb=487470, bsz=16414, num_updates=8600, lr=0.000681994, gnorm=0.324, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=11746 epoch 006: 94 / 1707 loss=3.883, nll_loss=2.358, ppl=5.13, wps=362108, ups=0.74, wpb=487470, bsz=16414, num_updates=8600, lr=0.000681994, gnorm=0.324, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=11746 epoch 006: 94 / 1707 loss=3.883, nll_loss=2.358, ppl=5.13, wps=362108, ups=0.74, wpb=487470, bsz=16414, num_updates=8600, lr=0.000681994, gnorm=0.324, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=11746 epoch 006: 94 / 1707 loss=3.883, nll_loss=2.358, ppl=5.13, wps=362108, ups=0.74, wpb=487470, bsz=16414, num_updates=8600, lr=0.000681994, gnorm=0.324, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=11746 epoch 006: 94 / 1707 loss=3.883, nll_loss=2.358, ppl=5.13, wps=362108, ups=0.74, wpb=487470, bsz=16414, num_updates=8600, lr=0.000681994, gnorm=0.324, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=11746 epoch 006: 94 / 1707 loss=3.883, nll_loss=2.358, ppl=5.13, wps=362108, ups=0.74, wpb=487470, bsz=16414, num_updates=8600, lr=0.000681994, gnorm=0.324, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=11746 epoch 006: 194 / 1707 loss=3.884, nll_loss=2.359, ppl=5.13, wps=366763, ups=0.75, wpb=491210, bsz=16245.4, num_updates=8700, lr=0.000678064, gnorm=0.31, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=11880 epoch 006: 194 / 1707 loss=3.884, nll_loss=2.359, ppl=5.13, wps=366763, ups=0.75, wpb=491210, bsz=16245.4, num_updates=8700, lr=0.000678064, gnorm=0.31, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=11880 epoch 006: 194 / 1707 loss=3.884, nll_loss=2.359, ppl=5.13, wps=366763, ups=0.75, wpb=491210, bsz=16245.4, num_updates=8700, lr=0.000678064, gnorm=0.31, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=11880 epoch 006: 194 / 1707 loss=3.884, nll_loss=2.359, ppl=5.13, wps=366763, ups=0.75, wpb=491210, bsz=16245.4, num_updates=8700, lr=0.000678064, gnorm=0.31, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=11880 epoch 006: 194 / 1707 loss=3.884, nll_loss=2.359, ppl=5.13, wps=366763, ups=0.75, wpb=491210, bsz=16245.4, num_updates=8700, lr=0.000678064, gnorm=0.31, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=11880 epoch 006: 194 / 1707 loss=3.884, nll_loss=2.359, ppl=5.13, wps=366763, ups=0.75, wpb=491210, bsz=16245.4, num_updates=8700, lr=0.000678064, gnorm=0.31, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=11880 epoch 006: 294 / 1707 loss=3.884, nll_loss=2.359, ppl=5.13, wps=365908, ups=0.75, wpb=489205, bsz=16331.5, num_updates=8800, lr=0.0006742, gnorm=0.308, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=12014 epoch 006: 294 / 1707 loss=3.884, nll_loss=2.359, ppl=5.13, wps=365908, ups=0.75, wpb=489205, bsz=16331.5, num_updates=8800, lr=0.0006742, gnorm=0.308, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=12014 epoch 006: 294 / 1707 loss=3.884, nll_loss=2.359, ppl=5.13, wps=365908, ups=0.75, wpb=489205, bsz=16331.5, num_updates=8800, lr=0.0006742, gnorm=0.308, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=12014 epoch 006: 294 / 1707 loss=3.884, nll_loss=2.359, ppl=5.13, wps=365908, ups=0.75, wpb=489205, bsz=16331.5, num_updates=8800, lr=0.0006742, gnorm=0.308, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=12014 epoch 006: 294 / 1707 loss=3.884, nll_loss=2.359, ppl=5.13, wps=365908, ups=0.75, wpb=489205, bsz=16331.5, num_updates=8800, lr=0.0006742, gnorm=0.308, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=12014 epoch 006: 294 / 1707 loss=3.884, nll_loss=2.359, ppl=5.13, wps=365908, ups=0.75, wpb=489205, bsz=16331.5, num_updates=8800, lr=0.0006742, gnorm=0.308, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=12014 epoch 006: 394 / 1707 loss=3.88, nll_loss=2.355, ppl=5.12, wps=365552, ups=0.75, wpb=489795, bsz=16215.9, num_updates=8900, lr=0.000670402, gnorm=0.31, clip=0, loss_scale=8, train_wall=134, gb_free=61.1, wall=12148 epoch 006: 394 / 1707 loss=3.88, nll_loss=2.355, ppl=5.12, wps=365552, ups=0.75, wpb=489795, bsz=16215.9, num_updates=8900, lr=0.000670402, gnorm=0.31, clip=0, loss_scale=8, train_wall=134, gb_free=61.1, wall=12148 epoch 006: 394 / 1707 loss=3.88, nll_loss=2.355, ppl=5.12, wps=365552, ups=0.75, wpb=489795, bsz=16215.9, num_updates=8900, lr=0.000670402, gnorm=0.31, clip=0, loss_scale=8, train_wall=134, gb_free=61.1, wall=12148 epoch 006: 394 / 1707 loss=3.88, nll_loss=2.355, ppl=5.12, wps=365552, ups=0.75, wpb=489795, bsz=16215.9, num_updates=8900, lr=0.000670402, gnorm=0.31, clip=0, loss_scale=8, train_wall=134, gb_free=61.1, wall=12148 epoch 006: 394 / 1707 loss=3.88, nll_loss=2.355, ppl=5.12, wps=365552, ups=0.75, wpb=489795, bsz=16215.9, num_updates=8900, lr=0.000670402, gnorm=0.31, clip=0, loss_scale=8, train_wall=134, gb_free=61.1, wall=12148 epoch 006: 394 / 1707 loss=3.88, nll_loss=2.355, ppl=5.12, wps=365552, ups=0.75, wpb=489795, bsz=16215.9, num_updates=8900, lr=0.000670402, gnorm=0.31, clip=0, loss_scale=8, train_wall=134, gb_free=61.1, wall=12148 epoch 006: 494 / 1707 loss=3.888, nll_loss=2.365, ppl=5.15, wps=367103, ups=0.75, wpb=489924, bsz=16322.4, num_updates=9000, lr=0.000666667, gnorm=0.308, clip=0, loss_scale=8, train_wall=133, gb_free=60.9, wall=12281 epoch 006: 494 / 1707 loss=3.888, nll_loss=2.365, ppl=5.15, wps=367103, ups=0.75, wpb=489924, bsz=16322.4, num_updates=9000, lr=0.000666667, gnorm=0.308, clip=0, loss_scale=8, train_wall=133, gb_free=60.9, wall=12281 epoch 006: 494 / 1707 loss=3.888, nll_loss=2.365, ppl=5.15, wps=367103, ups=0.75, wpb=489924, bsz=16322.4, num_updates=9000, lr=0.000666667, gnorm=0.308, clip=0, loss_scale=8, train_wall=133, gb_free=60.9, wall=12281 epoch 006: 494 / 1707 loss=3.888, nll_loss=2.365, ppl=5.15, wps=367103, ups=0.75, wpb=489924, bsz=16322.4, num_updates=9000, lr=0.000666667, gnorm=0.308, clip=0, loss_scale=8, train_wall=133, gb_free=60.9, wall=12281 epoch 006: 494 / 1707 loss=3.888, nll_loss=2.365, ppl=5.15, wps=367103, ups=0.75, wpb=489924, bsz=16322.4, num_updates=9000, lr=0.000666667, gnorm=0.308, clip=0, loss_scale=8, train_wall=133, gb_free=60.9, wall=12281 epoch 006: 494 / 1707 loss=3.888, nll_loss=2.365, ppl=5.15, wps=367103, ups=0.75, wpb=489924, bsz=16322.4, num_updates=9000, lr=0.000666667, gnorm=0.308, clip=0, loss_scale=8, train_wall=133, gb_free=60.9, wall=12281 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 3.987 | nll_loss 2.45 | ppl 5.47 | wps 219344 | wpb 22263 | bsz 1004 | num_updates 9000 | best_loss 3.987 epoch 006 | valid on 'valid' subset | loss 3.987 | nll_loss 2.45 | ppl 5.47 | wps 219344 | wpb 22263 | bsz 1004 | num_updates 9000 | best_loss 3.987 epoch 006 | valid on 'valid' subset | loss 3.987 | nll_loss 2.45 | ppl 5.47 | wps 219344 | wpb 22263 | bsz 1004 | num_updates 9000 | best_loss 3.987 epoch 006 | valid on 'valid' subset | loss 3.987 | nll_loss 2.45 | ppl 5.47 | wps 219344 | wpb 22263 | bsz 1004 | num_updates 9000 | best_loss 3.987 epoch 006 | valid on 'valid' subset | loss 3.987 | nll_loss 2.45 | ppl 5.47 | wps 219344 | wpb 22263 | bsz 1004 | num_updates 9000 | best_loss 3.987 epoch 006 | valid on 'valid' subset | loss 3.987 | nll_loss 2.45 | ppl 5.47 | wps 219344 | wpb 22263 | bsz 1004 | num_updates 9000 | best_loss 3.987 epoch 006: 595 / 1707 loss=3.879, nll_loss=2.355, ppl=5.11, wps=323405, ups=0.66, wpb=490274, bsz=16308.4, num_updates=9100, lr=0.000662994, gnorm=0.302, clip=0, loss_scale=8, train_wall=135, gb_free=60.3, wall=12433 epoch 006: 595 / 1707 loss=3.879, nll_loss=2.355, ppl=5.11, wps=323405, ups=0.66, wpb=490274, bsz=16308.4, num_updates=9100, lr=0.000662994, gnorm=0.302, clip=0, loss_scale=8, train_wall=135, gb_free=60.3, wall=12433 epoch 006: 595 / 1707 loss=3.879, nll_loss=2.355, ppl=5.11, wps=323405, ups=0.66, wpb=490274, bsz=16308.4, num_updates=9100, lr=0.000662994, gnorm=0.302, clip=0, loss_scale=8, train_wall=135, gb_free=60.3, wall=12433 epoch 006: 595 / 1707 loss=3.879, nll_loss=2.355, ppl=5.11, wps=323405, ups=0.66, wpb=490274, bsz=16308.4, num_updates=9100, lr=0.000662994, gnorm=0.302, clip=0, loss_scale=8, train_wall=135, gb_free=60.3, wall=12433 epoch 006: 595 / 1707 loss=3.879, nll_loss=2.355, ppl=5.11, wps=323405, ups=0.66, wpb=490274, bsz=16308.4, num_updates=9100, lr=0.000662994, gnorm=0.302, clip=0, loss_scale=8, train_wall=135, gb_free=60.3, wall=12433 epoch 006: 595 / 1707 loss=3.879, nll_loss=2.355, ppl=5.11, wps=323405, ups=0.66, wpb=490274, bsz=16308.4, num_updates=9100, lr=0.000662994, gnorm=0.302, clip=0, loss_scale=8, train_wall=135, gb_free=60.3, wall=12433 epoch 006: 695 / 1707 loss=3.881, nll_loss=2.357, ppl=5.12, wps=367169, ups=0.75, wpb=489930, bsz=16217.9, num_updates=9200, lr=0.00065938, gnorm=0.313, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=12567 epoch 006: 695 / 1707 loss=3.881, nll_loss=2.357, ppl=5.12, wps=367169, ups=0.75, wpb=489930, bsz=16217.9, num_updates=9200, lr=0.00065938, gnorm=0.313, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=12567 epoch 006: 695 / 1707 loss=3.881, nll_loss=2.357, ppl=5.12, wps=367169, ups=0.75, wpb=489930, bsz=16217.9, num_updates=9200, lr=0.00065938, gnorm=0.313, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=12567 epoch 006: 695 / 1707 loss=3.881, nll_loss=2.357, ppl=5.12, wps=367169, ups=0.75, wpb=489930, bsz=16217.9, num_updates=9200, lr=0.00065938, gnorm=0.313, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=12567 epoch 006: 695 / 1707 loss=3.881, nll_loss=2.357, ppl=5.12, wps=367169, ups=0.75, wpb=489930, bsz=16217.9, num_updates=9200, lr=0.00065938, gnorm=0.313, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=12567 epoch 006: 695 / 1707 loss=3.881, nll_loss=2.357, ppl=5.12, wps=367169, ups=0.75, wpb=489930, bsz=16217.9, num_updates=9200, lr=0.00065938, gnorm=0.313, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=12567 epoch 006: 796 / 1707 loss=3.877, nll_loss=2.353, ppl=5.11, wps=361285, ups=0.74, wpb=488931, bsz=16544.6, num_updates=9300, lr=0.000655826, gnorm=0.309, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=12702 epoch 006: 796 / 1707 loss=3.877, nll_loss=2.353, ppl=5.11, wps=361285, ups=0.74, wpb=488931, bsz=16544.6, num_updates=9300, lr=0.000655826, gnorm=0.309, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=12702 epoch 006: 796 / 1707 loss=3.877, nll_loss=2.353, ppl=5.11, wps=361285, ups=0.74, wpb=488931, bsz=16544.6, num_updates=9300, lr=0.000655826, gnorm=0.309, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=12702 epoch 006: 796 / 1707 loss=3.877, nll_loss=2.353, ppl=5.11, wps=361285, ups=0.74, wpb=488931, bsz=16544.6, num_updates=9300, lr=0.000655826, gnorm=0.309, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=12702 epoch 006: 796 / 1707 loss=3.877, nll_loss=2.353, ppl=5.11, wps=361285, ups=0.74, wpb=488931, bsz=16544.6, num_updates=9300, lr=0.000655826, gnorm=0.309, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=12702 epoch 006: 796 / 1707 loss=3.877, nll_loss=2.353, ppl=5.11, wps=361285, ups=0.74, wpb=488931, bsz=16544.6, num_updates=9300, lr=0.000655826, gnorm=0.309, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=12702 epoch 006: 896 / 1707 loss=3.869, nll_loss=2.344, ppl=5.08, wps=365927, ups=0.75, wpb=489886, bsz=16182.3, num_updates=9400, lr=0.000652328, gnorm=0.301, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=12836 epoch 006: 896 / 1707 loss=3.869, nll_loss=2.344, ppl=5.08, wps=365927, ups=0.75, wpb=489886, bsz=16182.3, num_updates=9400, lr=0.000652328, gnorm=0.301, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=12836 epoch 006: 896 / 1707 loss=3.869, nll_loss=2.344, ppl=5.08, wps=365927, ups=0.75, wpb=489886, bsz=16182.3, num_updates=9400, lr=0.000652328, gnorm=0.301, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=12836 epoch 006: 896 / 1707 loss=3.869, nll_loss=2.344, ppl=5.08, wps=365927, ups=0.75, wpb=489886, bsz=16182.3, num_updates=9400, lr=0.000652328, gnorm=0.301, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=12836 epoch 006: 896 / 1707 loss=3.869, nll_loss=2.344, ppl=5.08, wps=365927, ups=0.75, wpb=489886, bsz=16182.3, num_updates=9400, lr=0.000652328, gnorm=0.301, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=12836 epoch 006: 896 / 1707 loss=3.869, nll_loss=2.344, ppl=5.08, wps=365927, ups=0.75, wpb=489886, bsz=16182.3, num_updates=9400, lr=0.000652328, gnorm=0.301, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=12836 epoch 006: 996 / 1707 loss=3.871, nll_loss=2.347, ppl=5.09, wps=365685, ups=0.75, wpb=490604, bsz=16059.7, num_updates=9500, lr=0.000648886, gnorm=0.298, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=12970 epoch 006: 996 / 1707 loss=3.871, nll_loss=2.347, ppl=5.09, wps=365685, ups=0.75, wpb=490604, bsz=16059.7, num_updates=9500, lr=0.000648886, gnorm=0.298, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=12970 epoch 006: 996 / 1707 loss=3.871, nll_loss=2.347, ppl=5.09, wps=365685, ups=0.75, wpb=490604, bsz=16059.7, num_updates=9500, lr=0.000648886, gnorm=0.298, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=12970 epoch 006: 996 / 1707 loss=3.871, nll_loss=2.347, ppl=5.09, wps=365685, ups=0.75, wpb=490604, bsz=16059.7, num_updates=9500, lr=0.000648886, gnorm=0.298, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=12970 epoch 006: 996 / 1707 loss=3.871, nll_loss=2.347, ppl=5.09, wps=365685, ups=0.75, wpb=490604, bsz=16059.7, num_updates=9500, lr=0.000648886, gnorm=0.298, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=12970 epoch 006: 996 / 1707 loss=3.871, nll_loss=2.347, ppl=5.09, wps=365685, ups=0.75, wpb=490604, bsz=16059.7, num_updates=9500, lr=0.000648886, gnorm=0.298, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=12970 epoch 006: 1096 / 1707 loss=3.863, nll_loss=2.338, ppl=5.06, wps=367953, ups=0.75, wpb=491268, bsz=16374.7, num_updates=9600, lr=0.000645497, gnorm=0.306, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=13103 epoch 006: 1096 / 1707 loss=3.863, nll_loss=2.338, ppl=5.06, wps=367953, ups=0.75, wpb=491268, bsz=16374.7, num_updates=9600, lr=0.000645497, gnorm=0.306, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=13103 epoch 006: 1096 / 1707 loss=3.863, nll_loss=2.338, ppl=5.06, wps=367953, ups=0.75, wpb=491268, bsz=16374.7, num_updates=9600, lr=0.000645497, gnorm=0.306, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=13103 epoch 006: 1096 / 1707 loss=3.863, nll_loss=2.338, ppl=5.06, wps=367953, ups=0.75, wpb=491268, bsz=16374.7, num_updates=9600, lr=0.000645497, gnorm=0.306, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=13103 epoch 006: 1096 / 1707 loss=3.863, nll_loss=2.338, ppl=5.06, wps=367953, ups=0.75, wpb=491268, bsz=16374.7, num_updates=9600, lr=0.000645497, gnorm=0.306, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=13103 epoch 006: 1096 / 1707 loss=3.863, nll_loss=2.338, ppl=5.06, wps=367953, ups=0.75, wpb=491268, bsz=16374.7, num_updates=9600, lr=0.000645497, gnorm=0.306, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=13103 epoch 006: 1196 / 1707 loss=3.864, nll_loss=2.339, ppl=5.06, wps=366600, ups=0.75, wpb=489741, bsz=16539.1, num_updates=9700, lr=0.000642161, gnorm=0.303, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=13237 epoch 006: 1196 / 1707 loss=3.864, nll_loss=2.339, ppl=5.06, wps=366600, ups=0.75, wpb=489741, bsz=16539.1, num_updates=9700, lr=0.000642161, gnorm=0.303, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=13237 epoch 006: 1196 / 1707 loss=3.864, nll_loss=2.339, ppl=5.06, wps=366600, ups=0.75, wpb=489741, bsz=16539.1, num_updates=9700, lr=0.000642161, gnorm=0.303, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=13237 epoch 006: 1196 / 1707 loss=3.864, nll_loss=2.339, ppl=5.06, wps=366600, ups=0.75, wpb=489741, bsz=16539.1, num_updates=9700, lr=0.000642161, gnorm=0.303, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=13237 epoch 006: 1196 / 1707 loss=3.864, nll_loss=2.339, ppl=5.06, wps=366600, ups=0.75, wpb=489741, bsz=16539.1, num_updates=9700, lr=0.000642161, gnorm=0.303, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=13237 epoch 006: 1196 / 1707 loss=3.864, nll_loss=2.339, ppl=5.06, wps=366600, ups=0.75, wpb=489741, bsz=16539.1, num_updates=9700, lr=0.000642161, gnorm=0.303, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=13237 epoch 006: 1297 / 1707 loss=3.864, nll_loss=2.339, ppl=5.06, wps=361050, ups=0.74, wpb=489632, bsz=16372.5, num_updates=9800, lr=0.000638877, gnorm=0.3, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=13373 epoch 006: 1297 / 1707 loss=3.864, nll_loss=2.339, ppl=5.06, wps=361050, ups=0.74, wpb=489632, bsz=16372.5, num_updates=9800, lr=0.000638877, gnorm=0.3, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=13373 epoch 006: 1297 / 1707 loss=3.864, nll_loss=2.339, ppl=5.06, wps=361050, ups=0.74, wpb=489632, bsz=16372.5, num_updates=9800, lr=0.000638877, gnorm=0.3, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=13373 epoch 006: 1297 / 1707 loss=3.864, nll_loss=2.339, ppl=5.06, wps=361050, ups=0.74, wpb=489632, bsz=16372.5, num_updates=9800, lr=0.000638877, gnorm=0.3, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=13373 epoch 006: 1297 / 1707 loss=3.864, nll_loss=2.339, ppl=5.06, wps=361050, ups=0.74, wpb=489632, bsz=16372.5, num_updates=9800, lr=0.000638877, gnorm=0.3, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=13373 epoch 006: 1297 / 1707 loss=3.864, nll_loss=2.339, ppl=5.06, wps=361050, ups=0.74, wpb=489632, bsz=16372.5, num_updates=9800, lr=0.000638877, gnorm=0.3, clip=0, loss_scale=8, train_wall=135, gb_free=60.4, wall=13373 epoch 006: 1397 / 1707 loss=3.857, nll_loss=2.332, ppl=5.04, wps=365296, ups=0.75, wpb=489975, bsz=16344.5, num_updates=9900, lr=0.000635642, gnorm=0.297, clip=0, loss_scale=8, train_wall=134, gb_free=60.3, wall=13507 epoch 006: 1397 / 1707 loss=3.857, nll_loss=2.332, ppl=5.04, wps=365296, ups=0.75, wpb=489975, bsz=16344.5, num_updates=9900, lr=0.000635642, gnorm=0.297, clip=0, loss_scale=8, train_wall=134, gb_free=60.3, wall=13507 epoch 006: 1397 / 1707 loss=3.857, nll_loss=2.332, ppl=5.04, wps=365296, ups=0.75, wpb=489975, bsz=16344.5, num_updates=9900, lr=0.000635642, gnorm=0.297, clip=0, loss_scale=8, train_wall=134, gb_free=60.3, wall=13507 epoch 006: 1397 / 1707 loss=3.857, nll_loss=2.332, ppl=5.04, wps=365296, ups=0.75, wpb=489975, bsz=16344.5, num_updates=9900, lr=0.000635642, gnorm=0.297, clip=0, loss_scale=8, train_wall=134, gb_free=60.3, wall=13507 epoch 006: 1397 / 1707 loss=3.857, nll_loss=2.332, ppl=5.04, wps=365296, ups=0.75, wpb=489975, bsz=16344.5, num_updates=9900, lr=0.000635642, gnorm=0.297, clip=0, loss_scale=8, train_wall=134, gb_free=60.3, wall=13507 epoch 006: 1397 / 1707 loss=3.857, nll_loss=2.332, ppl=5.04, wps=365296, ups=0.75, wpb=489975, bsz=16344.5, num_updates=9900, lr=0.000635642, gnorm=0.297, clip=0, loss_scale=8, train_wall=134, gb_free=60.3, wall=13507 epoch 006: 1497 / 1707 loss=3.852, nll_loss=2.327, ppl=5.02, wps=367099, ups=0.75, wpb=491299, bsz=16241.5, num_updates=10000, lr=0.000632456, gnorm=0.298, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=13641 epoch 006: 1497 / 1707 loss=3.852, nll_loss=2.327, ppl=5.02, wps=367099, ups=0.75, wpb=491299, bsz=16241.5, num_updates=10000, lr=0.000632456, gnorm=0.298, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=13641 epoch 006: 1497 / 1707 loss=3.852, nll_loss=2.327, ppl=5.02, wps=367099, ups=0.75, wpb=491299, bsz=16241.5, num_updates=10000, lr=0.000632456, gnorm=0.298, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=13641 epoch 006: 1497 / 1707 loss=3.852, nll_loss=2.327, ppl=5.02, wps=367099, ups=0.75, wpb=491299, bsz=16241.5, num_updates=10000, lr=0.000632456, gnorm=0.298, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=13641 epoch 006: 1497 / 1707 loss=3.852, nll_loss=2.327, ppl=5.02, wps=367099, ups=0.75, wpb=491299, bsz=16241.5, num_updates=10000, lr=0.000632456, gnorm=0.298, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=13641 epoch 006: 1497 / 1707 loss=3.852, nll_loss=2.327, ppl=5.02, wps=367099, ups=0.75, wpb=491299, bsz=16241.5, num_updates=10000, lr=0.000632456, gnorm=0.298, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=13641 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 3.95 | nll_loss 2.409 | ppl 5.31 | wps 220331 | wpb 22263 | bsz 1004 | num_updates 10000 | best_loss 3.95 epoch 006 | valid on 'valid' subset | loss 3.95 | nll_loss 2.409 | ppl 5.31 | wps 220331 | wpb 22263 | bsz 1004 | num_updates 10000 | best_loss 3.95 epoch 006 | valid on 'valid' subset | loss 3.95 | nll_loss 2.409 | ppl 5.31 | wps 220331 | wpb 22263 | bsz 1004 | num_updates 10000 | best_loss 3.95 epoch 006 | valid on 'valid' subset | loss 3.95 | nll_loss 2.409 | ppl 5.31 | wps 220331 | wpb 22263 | bsz 1004 | num_updates 10000 | best_loss 3.95 epoch 006 | valid on 'valid' subset | loss 3.95 | nll_loss 2.409 | ppl 5.31 | wps 220331 | wpb 22263 | bsz 1004 | num_updates 10000 | best_loss 3.95 epoch 006 | valid on 'valid' subset | loss 3.95 | nll_loss 2.409 | ppl 5.31 | wps 220331 | wpb 22263 | bsz 1004 | num_updates 10000 | best_loss 3.95 epoch 006: 1599 / 1707 loss=3.854, nll_loss=2.33, ppl=5.03, wps=319925, ups=0.65, wpb=489707, bsz=16619.5, num_updates=10100, lr=0.000629317, gnorm=0.312, clip=0, loss_scale=4, train_wall=136, gb_free=60.5, wall=13794 epoch 006: 1599 / 1707 loss=3.854, nll_loss=2.33, ppl=5.03, wps=319925, ups=0.65, wpb=489707, bsz=16619.5, num_updates=10100, lr=0.000629317, gnorm=0.312, clip=0, loss_scale=4, train_wall=136, gb_free=60.5, wall=13794 epoch 006: 1599 / 1707 loss=3.854, nll_loss=2.33, ppl=5.03, wps=319925, ups=0.65, wpb=489707, bsz=16619.5, num_updates=10100, lr=0.000629317, gnorm=0.312, clip=0, loss_scale=4, train_wall=136, gb_free=60.5, wall=13794 epoch 006: 1599 / 1707 loss=3.854, nll_loss=2.33, ppl=5.03, wps=319925, ups=0.65, wpb=489707, bsz=16619.5, num_updates=10100, lr=0.000629317, gnorm=0.312, clip=0, loss_scale=4, train_wall=136, gb_free=60.5, wall=13794 epoch 006: 1599 / 1707 loss=3.854, nll_loss=2.33, ppl=5.03, wps=319925, ups=0.65, wpb=489707, bsz=16619.5, num_updates=10100, lr=0.000629317, gnorm=0.312, clip=0, loss_scale=4, train_wall=136, gb_free=60.5, wall=13794 epoch 006: 1599 / 1707 loss=3.854, nll_loss=2.33, ppl=5.03, wps=319925, ups=0.65, wpb=489707, bsz=16619.5, num_updates=10100, lr=0.000629317, gnorm=0.312, clip=0, loss_scale=4, train_wall=136, gb_free=60.5, wall=13794 epoch 006: 1699 / 1707 loss=3.854, nll_loss=2.329, ppl=5.02, wps=365685, ups=0.75, wpb=489445, bsz=16351.9, num_updates=10200, lr=0.000626224, gnorm=0.291, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=13927 epoch 006: 1699 / 1707 loss=3.854, nll_loss=2.329, ppl=5.02, wps=365685, ups=0.75, wpb=489445, bsz=16351.9, num_updates=10200, lr=0.000626224, gnorm=0.291, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=13927 epoch 006: 1699 / 1707 loss=3.854, nll_loss=2.329, ppl=5.02, wps=365685, ups=0.75, wpb=489445, bsz=16351.9, num_updates=10200, lr=0.000626224, gnorm=0.291, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=13927 epoch 006: 1699 / 1707 loss=3.854, nll_loss=2.329, ppl=5.02, wps=365685, ups=0.75, wpb=489445, bsz=16351.9, num_updates=10200, lr=0.000626224, gnorm=0.291, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=13927 epoch 006: 1699 / 1707 loss=3.854, nll_loss=2.329, ppl=5.02, wps=365685, ups=0.75, wpb=489445, bsz=16351.9, num_updates=10200, lr=0.000626224, gnorm=0.291, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=13927 epoch 006: 1699 / 1707 loss=3.854, nll_loss=2.329, ppl=5.02, wps=365685, ups=0.75, wpb=489445, bsz=16351.9, num_updates=10200, lr=0.000626224, gnorm=0.291, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=13927 end of epoch 6 (average epoch stats below) epoch 006 | loss 3.87 | nll_loss 2.346 | ppl 5.08 | wps 359655 | ups 0.73 | wpb 489892 | bsz 16332.3 | num_updates 10208 | lr 0.000625979 | gnorm 0.305 | clip 0 | loss_scale 4 | train_wall 2276 | gb_free 60.5 | wall 13937 epoch 006 | loss 3.87 | nll_loss 2.346 | ppl 5.08 | wps 359655 | ups 0.73 | wpb 489892 | bsz 16332.3 | num_updates 10208 | lr 0.000625979 | gnorm 0.305 | clip 0 | loss_scale 4 | train_wall 2276 | gb_free 60.5 | wall 13937 epoch 006 | loss 3.87 | nll_loss 2.346 | ppl 5.08 | wps 359655 | ups 0.73 | wpb 489892 | bsz 16332.3 | num_updates 10208 | lr 0.000625979 | gnorm 0.305 | clip 0 | loss_scale 4 | train_wall 2276 | gb_free 60.5 | wall 13937 epoch 006 | loss 3.87 | nll_loss 2.346 | ppl 5.08 | wps 359655 | ups 0.73 | wpb 489892 | bsz 16332.3 | num_updates 10208 | lr 0.000625979 | gnorm 0.305 | clip 0 | loss_scale 4 | train_wall 2276 | gb_free 60.5 | wall 13937 epoch 006 | loss 3.87 | nll_loss 2.346 | ppl 5.08 | wps 359655 | ups 0.73 | wpb 489892 | bsz 16332.3 | num_updates 10208 | lr 0.000625979 | gnorm 0.305 | clip 0 | loss_scale 4 | train_wall 2276 | gb_free 60.5 | wall 13937 epoch 006 | loss 3.87 | nll_loss 2.346 | ppl 5.08 | wps 359655 | ups 0.73 | wpb 489892 | bsz 16332.3 | num_updates 10208 | lr 0.000625979 | gnorm 0.305 | clip 0 | loss_scale 4 | train_wall 2276 | gb_free 60.5 | wall 13937 Start iterating over samples epoch 007: 92 / 1707 loss=3.824, nll_loss=2.295, ppl=4.91, wps=365907, ups=0.75, wpb=486378, bsz=15901.3, num_updates=10300, lr=0.000623177, gnorm=0.305, clip=0, loss_scale=4, train_wall=132, gb_free=60.6, wall=14060 epoch 007: 92 / 1707 loss=3.824, nll_loss=2.295, ppl=4.91, wps=365907, ups=0.75, wpb=486378, bsz=15901.3, num_updates=10300, lr=0.000623177, gnorm=0.305, clip=0, loss_scale=4, train_wall=132, gb_free=60.6, wall=14060 epoch 007: 92 / 1707 loss=3.824, nll_loss=2.295, ppl=4.91, wps=365907, ups=0.75, wpb=486378, bsz=15901.3, num_updates=10300, lr=0.000623177, gnorm=0.305, clip=0, loss_scale=4, train_wall=132, gb_free=60.6, wall=14060 epoch 007: 92 / 1707 loss=3.824, nll_loss=2.295, ppl=4.91, wps=365907, ups=0.75, wpb=486378, bsz=15901.3, num_updates=10300, lr=0.000623177, gnorm=0.305, clip=0, loss_scale=4, train_wall=132, gb_free=60.6, wall=14060 epoch 007: 92 / 1707 loss=3.824, nll_loss=2.295, ppl=4.91, wps=365907, ups=0.75, wpb=486378, bsz=15901.3, num_updates=10300, lr=0.000623177, gnorm=0.305, clip=0, loss_scale=4, train_wall=132, gb_free=60.6, wall=14060 epoch 007: 92 / 1707 loss=3.824, nll_loss=2.295, ppl=4.91, wps=365907, ups=0.75, wpb=486378, bsz=15901.3, num_updates=10300, lr=0.000623177, gnorm=0.305, clip=0, loss_scale=4, train_wall=132, gb_free=60.6, wall=14060 epoch 007: 92 / 1707 loss=3.824, nll_loss=2.295, ppl=4.91, wps=365907, ups=0.75, wpb=486378, bsz=15901.3, num_updates=10300, lr=0.000623177, gnorm=0.305, clip=0, loss_scale=4, train_wall=132, gb_free=60.6, wall=14060 epoch 007: 194 / 1707 loss=3.82, nll_loss=2.29, ppl=4.89, wps=360877, ups=0.74, wpb=490141, bsz=16446.6, num_updates=10400, lr=0.000620174, gnorm=0.293, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=14196 epoch 007: 194 / 1707 loss=3.82, nll_loss=2.29, ppl=4.89, wps=360877, ups=0.74, wpb=490141, bsz=16446.6, num_updates=10400, lr=0.000620174, gnorm=0.293, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=14196 epoch 007: 194 / 1707 loss=3.82, nll_loss=2.29, ppl=4.89, wps=360877, ups=0.74, wpb=490141, bsz=16446.6, num_updates=10400, lr=0.000620174, gnorm=0.293, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=14196 epoch 007: 194 / 1707 loss=3.82, nll_loss=2.29, ppl=4.89, wps=360877, ups=0.74, wpb=490141, bsz=16446.6, num_updates=10400, lr=0.000620174, gnorm=0.293, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=14196 epoch 007: 194 / 1707 loss=3.82, nll_loss=2.29, ppl=4.89, wps=360877, ups=0.74, wpb=490141, bsz=16446.6, num_updates=10400, lr=0.000620174, gnorm=0.293, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=14196 epoch 007: 194 / 1707 loss=3.82, nll_loss=2.29, ppl=4.89, wps=360877, ups=0.74, wpb=490141, bsz=16446.6, num_updates=10400, lr=0.000620174, gnorm=0.293, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=14196 epoch 007: 194 / 1707 loss=3.82, nll_loss=2.29, ppl=4.89, wps=360877, ups=0.74, wpb=490141, bsz=16446.6, num_updates=10400, lr=0.000620174, gnorm=0.293, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=14196 epoch 007: 294 / 1707 loss=3.816, nll_loss=2.286, ppl=4.88, wps=367724, ups=0.75, wpb=490353, bsz=16417.3, num_updates=10500, lr=0.000617213, gnorm=0.294, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=14330 epoch 007: 294 / 1707 loss=3.816, nll_loss=2.286, ppl=4.88, wps=367724, ups=0.75, wpb=490353, bsz=16417.3, num_updates=10500, lr=0.000617213, gnorm=0.294, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=14330 epoch 007: 294 / 1707 loss=3.816, nll_loss=2.286, ppl=4.88, wps=367724, ups=0.75, wpb=490353, bsz=16417.3, num_updates=10500, lr=0.000617213, gnorm=0.294, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=14330 epoch 007: 294 / 1707 loss=3.816, nll_loss=2.286, ppl=4.88, wps=367724, ups=0.75, wpb=490353, bsz=16417.3, num_updates=10500, lr=0.000617213, gnorm=0.294, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=14330 epoch 007: 294 / 1707 loss=3.816, nll_loss=2.286, ppl=4.88, wps=367724, ups=0.75, wpb=490353, bsz=16417.3, num_updates=10500, lr=0.000617213, gnorm=0.294, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=14330 epoch 007: 294 / 1707 loss=3.816, nll_loss=2.286, ppl=4.88, wps=367724, ups=0.75, wpb=490353, bsz=16417.3, num_updates=10500, lr=0.000617213, gnorm=0.294, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=14330 epoch 007: 294 / 1707 loss=3.816, nll_loss=2.286, ppl=4.88, wps=367724, ups=0.75, wpb=490353, bsz=16417.3, num_updates=10500, lr=0.000617213, gnorm=0.294, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=14330 epoch 007: 394 / 1707 loss=3.816, nll_loss=2.287, ppl=4.88, wps=369175, ups=0.75, wpb=491382, bsz=16104.2, num_updates=10600, lr=0.000614295, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=14463 epoch 007: 394 / 1707 loss=3.816, nll_loss=2.287, ppl=4.88, wps=369175, ups=0.75, wpb=491382, bsz=16104.2, num_updates=10600, lr=0.000614295, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=14463 epoch 007: 394 / 1707 loss=3.816, nll_loss=2.287, ppl=4.88, wps=369175, ups=0.75, wpb=491382, bsz=16104.2, num_updates=10600, lr=0.000614295, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=14463 epoch 007: 394 / 1707 loss=3.816, nll_loss=2.287, ppl=4.88, wps=369175, ups=0.75, wpb=491382, bsz=16104.2, num_updates=10600, lr=0.000614295, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=14463 epoch 007: 394 / 1707 loss=3.816, nll_loss=2.287, ppl=4.88, wps=369175, ups=0.75, wpb=491382, bsz=16104.2, num_updates=10600, lr=0.000614295, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=14463 epoch 007: 394 / 1707 loss=3.816, nll_loss=2.287, ppl=4.88, wps=369175, ups=0.75, wpb=491382, bsz=16104.2, num_updates=10600, lr=0.000614295, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=14463 epoch 007: 394 / 1707 loss=3.816, nll_loss=2.287, ppl=4.88, wps=369175, ups=0.75, wpb=491382, bsz=16104.2, num_updates=10600, lr=0.000614295, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=14463 epoch 007: 494 / 1707 loss=3.818, nll_loss=2.289, ppl=4.89, wps=366391, ups=0.75, wpb=489966, bsz=16279, num_updates=10700, lr=0.000611418, gnorm=0.288, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=14596 epoch 007: 494 / 1707 loss=3.818, nll_loss=2.289, ppl=4.89, wps=366391, ups=0.75, wpb=489966, bsz=16279, num_updates=10700, lr=0.000611418, gnorm=0.288, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=14596 epoch 007: 494 / 1707 loss=3.818, nll_loss=2.289, ppl=4.89, wps=366391, ups=0.75, wpb=489966, bsz=16279, num_updates=10700, lr=0.000611418, gnorm=0.288, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=14596 epoch 007: 494 / 1707 loss=3.818, nll_loss=2.289, ppl=4.89, wps=366391, ups=0.75, wpb=489966, bsz=16279, num_updates=10700, lr=0.000611418, gnorm=0.288, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=14596 epoch 007: 494 / 1707 loss=3.818, nll_loss=2.289, ppl=4.89, wps=366391, ups=0.75, wpb=489966, bsz=16279, num_updates=10700, lr=0.000611418, gnorm=0.288, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=14596 epoch 007: 494 / 1707 loss=3.818, nll_loss=2.289, ppl=4.89, wps=366391, ups=0.75, wpb=489966, bsz=16279, num_updates=10700, lr=0.000611418, gnorm=0.288, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=14596 epoch 007: 494 / 1707 loss=3.818, nll_loss=2.289, ppl=4.89, wps=366391, ups=0.75, wpb=489966, bsz=16279, num_updates=10700, lr=0.000611418, gnorm=0.288, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=14596 epoch 007: 594 / 1707 loss=3.812, nll_loss=2.283, ppl=4.87, wps=367251, ups=0.75, wpb=491395, bsz=16602.3, num_updates=10800, lr=0.000608581, gnorm=0.283, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=14730 epoch 007: 594 / 1707 loss=3.812, nll_loss=2.283, ppl=4.87, wps=367251, ups=0.75, wpb=491395, bsz=16602.3, num_updates=10800, lr=0.000608581, gnorm=0.283, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=14730 epoch 007: 594 / 1707 loss=3.812, nll_loss=2.283, ppl=4.87, wps=367251, ups=0.75, wpb=491395, bsz=16602.3, num_updates=10800, lr=0.000608581, gnorm=0.283, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=14730 epoch 007: 594 / 1707 loss=3.812, nll_loss=2.283, ppl=4.87, wps=367251, ups=0.75, wpb=491395, bsz=16602.3, num_updates=10800, lr=0.000608581, gnorm=0.283, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=14730 epoch 007: 594 / 1707 loss=3.812, nll_loss=2.283, ppl=4.87, wps=367251, ups=0.75, wpb=491395, bsz=16602.3, num_updates=10800, lr=0.000608581, gnorm=0.283, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=14730 epoch 007: 594 / 1707 loss=3.812, nll_loss=2.283, ppl=4.87, wps=367251, ups=0.75, wpb=491395, bsz=16602.3, num_updates=10800, lr=0.000608581, gnorm=0.283, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=14730 epoch 007: 594 / 1707 loss=3.812, nll_loss=2.283, ppl=4.87, wps=367251, ups=0.75, wpb=491395, bsz=16602.3, num_updates=10800, lr=0.000608581, gnorm=0.283, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=14730 epoch 007: 694 / 1707 loss=3.818, nll_loss=2.289, ppl=4.89, wps=365997, ups=0.75, wpb=489858, bsz=16432.5, num_updates=10900, lr=0.000605783, gnorm=0.303, clip=0, loss_scale=8, train_wall=133, gb_free=60.1, wall=14864 epoch 007: 694 / 1707 loss=3.818, nll_loss=2.289, ppl=4.89, wps=365997, ups=0.75, wpb=489858, bsz=16432.5, num_updates=10900, lr=0.000605783, gnorm=0.303, clip=0, loss_scale=8, train_wall=133, gb_free=60.1, wall=14864 epoch 007: 694 / 1707 loss=3.818, nll_loss=2.289, ppl=4.89, wps=365997, ups=0.75, wpb=489858, bsz=16432.5, num_updates=10900, lr=0.000605783, gnorm=0.303, clip=0, loss_scale=8, train_wall=133, gb_free=60.1, wall=14864 epoch 007: 694 / 1707 loss=3.818, nll_loss=2.289, ppl=4.89, wps=365997, ups=0.75, wpb=489858, bsz=16432.5, num_updates=10900, lr=0.000605783, gnorm=0.303, clip=0, loss_scale=8, train_wall=133, gb_free=60.1, wall=14864 epoch 007: 694 / 1707 loss=3.818, nll_loss=2.289, ppl=4.89, wps=365997, ups=0.75, wpb=489858, bsz=16432.5, num_updates=10900, lr=0.000605783, gnorm=0.303, clip=0, loss_scale=8, train_wall=133, gb_free=60.1, wall=14864 epoch 007: 694 / 1707 loss=3.818, nll_loss=2.289, ppl=4.89, wps=365997, ups=0.75, wpb=489858, bsz=16432.5, num_updates=10900, lr=0.000605783, gnorm=0.303, clip=0, loss_scale=8, train_wall=133, gb_free=60.1, wall=14864 epoch 007: 694 / 1707 loss=3.818, nll_loss=2.289, ppl=4.89, wps=365997, ups=0.75, wpb=489858, bsz=16432.5, num_updates=10900, lr=0.000605783, gnorm=0.303, clip=0, loss_scale=8, train_wall=133, gb_free=60.1, wall=14864 epoch 007: 794 / 1707 loss=3.816, nll_loss=2.288, ppl=4.88, wps=366844, ups=0.75, wpb=490041, bsz=16426.4, num_updates=11000, lr=0.000603023, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=61.4, wall=14998 epoch 007: 794 / 1707 loss=3.816, nll_loss=2.288, ppl=4.88, wps=366844, ups=0.75, wpb=490041, bsz=16426.4, num_updates=11000, lr=0.000603023, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=61.4, wall=14998 epoch 007: 794 / 1707 loss=3.816, nll_loss=2.288, ppl=4.88, wps=366844, ups=0.75, wpb=490041, bsz=16426.4, num_updates=11000, lr=0.000603023, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=61.4, wall=14998 epoch 007: 794 / 1707 loss=3.816, nll_loss=2.288, ppl=4.88, wps=366844, ups=0.75, wpb=490041, bsz=16426.4, num_updates=11000, lr=0.000603023, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=61.4, wall=14998 epoch 007: 794 / 1707 loss=3.816, nll_loss=2.288, ppl=4.88, wps=366844, ups=0.75, wpb=490041, bsz=16426.4, num_updates=11000, lr=0.000603023, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=61.4, wall=14998 epoch 007: 794 / 1707 loss=3.816, nll_loss=2.288, ppl=4.88, wps=366844, ups=0.75, wpb=490041, bsz=16426.4, num_updates=11000, lr=0.000603023, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=61.4, wall=14998 epoch 007: 794 / 1707 loss=3.816, nll_loss=2.288, ppl=4.88, wps=366844, ups=0.75, wpb=490041, bsz=16426.4, num_updates=11000, lr=0.000603023, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=61.4, wall=14998 begin validation on "valid" subset epoch 007 | valid on 'valid' subset | loss 3.935 | nll_loss 2.394 | ppl 5.26 | wps 222239 | wpb 22263 | bsz 1004 | num_updates 11000 | best_loss 3.935 epoch 007 | valid on 'valid' subset | loss 3.935 | nll_loss 2.394 | ppl 5.26 | wps 222239 | wpb 22263 | bsz 1004 | num_updates 11000 | best_loss 3.935 epoch 007 | valid on 'valid' subset | loss 3.935 | nll_loss 2.394 | ppl 5.26 | wps 222239 | wpb 22263 | bsz 1004 | num_updates 11000 | best_loss 3.935 epoch 007 | valid on 'valid' subset | loss 3.935 | nll_loss 2.394 | ppl 5.26 | wps 222239 | wpb 22263 | bsz 1004 | num_updates 11000 | best_loss 3.935 epoch 007 | valid on 'valid' subset | loss 3.935 | nll_loss 2.394 | ppl 5.26 | wps 222239 | wpb 22263 | bsz 1004 | num_updates 11000 | best_loss 3.935 epoch 007 | valid on 'valid' subset | loss 3.935 | nll_loss 2.394 | ppl 5.26 | wps 222239 | wpb 22263 | bsz 1004 | num_updates 11000 | best_loss 3.935 epoch 007 | valid on 'valid' subset | loss 3.935 | nll_loss 2.394 | ppl 5.26 | wps 222239 | wpb 22263 | bsz 1004 | num_updates 11000 | best_loss 3.935 epoch 007: 895 / 1707 loss=3.822, nll_loss=2.294, ppl=4.9, wps=319310, ups=0.65, wpb=490120, bsz=16419.6, num_updates=11100, lr=0.0006003, gnorm=0.291, clip=0, loss_scale=4, train_wall=135, gb_free=60.7, wall=15151 epoch 007: 895 / 1707 loss=3.822, nll_loss=2.294, ppl=4.9, wps=319310, ups=0.65, wpb=490120, bsz=16419.6, num_updates=11100, lr=0.0006003, gnorm=0.291, clip=0, loss_scale=4, train_wall=135, gb_free=60.7, wall=15151 epoch 007: 895 / 1707 loss=3.822, nll_loss=2.294, ppl=4.9, wps=319310, ups=0.65, wpb=490120, bsz=16419.6, num_updates=11100, lr=0.0006003, gnorm=0.291, clip=0, loss_scale=4, train_wall=135, gb_free=60.7, wall=15151 epoch 007: 895 / 1707 loss=3.822, nll_loss=2.294, ppl=4.9, wps=319310, ups=0.65, wpb=490120, bsz=16419.6, num_updates=11100, lr=0.0006003, gnorm=0.291, clip=0, loss_scale=4, train_wall=135, gb_free=60.7, wall=15151 epoch 007: 895 / 1707 loss=3.822, nll_loss=2.294, ppl=4.9, wps=319310, ups=0.65, wpb=490120, bsz=16419.6, num_updates=11100, lr=0.0006003, gnorm=0.291, clip=0, loss_scale=4, train_wall=135, gb_free=60.7, wall=15151 epoch 007: 895 / 1707 loss=3.822, nll_loss=2.294, ppl=4.9, wps=319310, ups=0.65, wpb=490120, bsz=16419.6, num_updates=11100, lr=0.0006003, gnorm=0.291, clip=0, loss_scale=4, train_wall=135, gb_free=60.7, wall=15151 epoch 007: 895 / 1707 loss=3.822, nll_loss=2.294, ppl=4.9, wps=319310, ups=0.65, wpb=490120, bsz=16419.6, num_updates=11100, lr=0.0006003, gnorm=0.291, clip=0, loss_scale=4, train_wall=135, gb_free=60.7, wall=15151 epoch 007: 995 / 1707 loss=3.812, nll_loss=2.284, ppl=4.87, wps=366429, ups=0.75, wpb=489577, bsz=16211.3, num_updates=11200, lr=0.000597614, gnorm=0.295, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=15285 epoch 007: 995 / 1707 loss=3.812, nll_loss=2.284, ppl=4.87, wps=366429, ups=0.75, wpb=489577, bsz=16211.3, num_updates=11200, lr=0.000597614, gnorm=0.295, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=15285 epoch 007: 995 / 1707 loss=3.812, nll_loss=2.284, ppl=4.87, wps=366429, ups=0.75, wpb=489577, bsz=16211.3, num_updates=11200, lr=0.000597614, gnorm=0.295, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=15285 epoch 007: 995 / 1707 loss=3.812, nll_loss=2.284, ppl=4.87, wps=366429, ups=0.75, wpb=489577, bsz=16211.3, num_updates=11200, lr=0.000597614, gnorm=0.295, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=15285 epoch 007: 995 / 1707 loss=3.812, nll_loss=2.284, ppl=4.87, wps=366429, ups=0.75, wpb=489577, bsz=16211.3, num_updates=11200, lr=0.000597614, gnorm=0.295, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=15285 epoch 007: 995 / 1707 loss=3.812, nll_loss=2.284, ppl=4.87, wps=366429, ups=0.75, wpb=489577, bsz=16211.3, num_updates=11200, lr=0.000597614, gnorm=0.295, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=15285 epoch 007: 995 / 1707 loss=3.812, nll_loss=2.284, ppl=4.87, wps=366429, ups=0.75, wpb=489577, bsz=16211.3, num_updates=11200, lr=0.000597614, gnorm=0.295, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=15285 epoch 007: 1095 / 1707 loss=3.815, nll_loss=2.287, ppl=4.88, wps=366042, ups=0.75, wpb=489730, bsz=16304.6, num_updates=11300, lr=0.000594964, gnorm=0.294, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=15419 epoch 007: 1095 / 1707 loss=3.815, nll_loss=2.287, ppl=4.88, wps=366042, ups=0.75, wpb=489730, bsz=16304.6, num_updates=11300, lr=0.000594964, gnorm=0.294, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=15419 epoch 007: 1095 / 1707 loss=3.815, nll_loss=2.287, ppl=4.88, wps=366042, ups=0.75, wpb=489730, bsz=16304.6, num_updates=11300, lr=0.000594964, gnorm=0.294, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=15419 epoch 007: 1095 / 1707 loss=3.815, nll_loss=2.287, ppl=4.88, wps=366042, ups=0.75, wpb=489730, bsz=16304.6, num_updates=11300, lr=0.000594964, gnorm=0.294, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=15419 epoch 007: 1095 / 1707 loss=3.815, nll_loss=2.287, ppl=4.88, wps=366042, ups=0.75, wpb=489730, bsz=16304.6, num_updates=11300, lr=0.000594964, gnorm=0.294, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=15419 epoch 007: 1095 / 1707 loss=3.815, nll_loss=2.287, ppl=4.88, wps=366042, ups=0.75, wpb=489730, bsz=16304.6, num_updates=11300, lr=0.000594964, gnorm=0.294, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=15419 epoch 007: 1095 / 1707 loss=3.815, nll_loss=2.287, ppl=4.88, wps=366042, ups=0.75, wpb=489730, bsz=16304.6, num_updates=11300, lr=0.000594964, gnorm=0.294, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=15419 epoch 007: 1196 / 1707 loss=3.811, nll_loss=2.282, ppl=4.86, wps=364404, ups=0.74, wpb=491327, bsz=16343.2, num_updates=11400, lr=0.000592349, gnorm=0.298, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=15553 epoch 007: 1196 / 1707 loss=3.811, nll_loss=2.282, ppl=4.86, wps=364404, ups=0.74, wpb=491327, bsz=16343.2, num_updates=11400, lr=0.000592349, gnorm=0.298, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=15553 epoch 007: 1196 / 1707 loss=3.811, nll_loss=2.282, ppl=4.86, wps=364404, ups=0.74, wpb=491327, bsz=16343.2, num_updates=11400, lr=0.000592349, gnorm=0.298, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=15553 epoch 007: 1196 / 1707 loss=3.811, nll_loss=2.282, ppl=4.86, wps=364404, ups=0.74, wpb=491327, bsz=16343.2, num_updates=11400, lr=0.000592349, gnorm=0.298, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=15553 epoch 007: 1196 / 1707 loss=3.811, nll_loss=2.282, ppl=4.86, wps=364404, ups=0.74, wpb=491327, bsz=16343.2, num_updates=11400, lr=0.000592349, gnorm=0.298, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=15553 epoch 007: 1196 / 1707 loss=3.811, nll_loss=2.282, ppl=4.86, wps=364404, ups=0.74, wpb=491327, bsz=16343.2, num_updates=11400, lr=0.000592349, gnorm=0.298, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=15553 epoch 007: 1196 / 1707 loss=3.811, nll_loss=2.282, ppl=4.86, wps=364404, ups=0.74, wpb=491327, bsz=16343.2, num_updates=11400, lr=0.000592349, gnorm=0.298, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=15553 epoch 007: 1296 / 1707 loss=3.808, nll_loss=2.279, ppl=4.85, wps=364999, ups=0.75, wpb=489790, bsz=16306.6, num_updates=11500, lr=0.000589768, gnorm=0.289, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=15688 epoch 007: 1296 / 1707 loss=3.808, nll_loss=2.279, ppl=4.85, wps=364999, ups=0.75, wpb=489790, bsz=16306.6, num_updates=11500, lr=0.000589768, gnorm=0.289, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=15688 epoch 007: 1296 / 1707 loss=3.808, nll_loss=2.279, ppl=4.85, wps=364999, ups=0.75, wpb=489790, bsz=16306.6, num_updates=11500, lr=0.000589768, gnorm=0.289, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=15688 epoch 007: 1296 / 1707 loss=3.808, nll_loss=2.279, ppl=4.85, wps=364999, ups=0.75, wpb=489790, bsz=16306.6, num_updates=11500, lr=0.000589768, gnorm=0.289, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=15688 epoch 007: 1296 / 1707 loss=3.808, nll_loss=2.279, ppl=4.85, wps=364999, ups=0.75, wpb=489790, bsz=16306.6, num_updates=11500, lr=0.000589768, gnorm=0.289, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=15688 epoch 007: 1296 / 1707 loss=3.808, nll_loss=2.279, ppl=4.85, wps=364999, ups=0.75, wpb=489790, bsz=16306.6, num_updates=11500, lr=0.000589768, gnorm=0.289, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=15688 epoch 007: 1296 / 1707 loss=3.808, nll_loss=2.279, ppl=4.85, wps=364999, ups=0.75, wpb=489790, bsz=16306.6, num_updates=11500, lr=0.000589768, gnorm=0.289, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=15688 epoch 007: 1396 / 1707 loss=3.806, nll_loss=2.277, ppl=4.85, wps=364996, ups=0.75, wpb=489794, bsz=16236.6, num_updates=11600, lr=0.00058722, gnorm=0.284, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=15822 epoch 007: 1396 / 1707 loss=3.806, nll_loss=2.277, ppl=4.85, wps=364996, ups=0.75, wpb=489794, bsz=16236.6, num_updates=11600, lr=0.00058722, gnorm=0.284, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=15822 epoch 007: 1396 / 1707 loss=3.806, nll_loss=2.277, ppl=4.85, wps=364996, ups=0.75, wpb=489794, bsz=16236.6, num_updates=11600, lr=0.00058722, gnorm=0.284, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=15822 epoch 007: 1396 / 1707 loss=3.806, nll_loss=2.277, ppl=4.85, wps=364996, ups=0.75, wpb=489794, bsz=16236.6, num_updates=11600, lr=0.00058722, gnorm=0.284, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=15822 epoch 007: 1396 / 1707 loss=3.806, nll_loss=2.277, ppl=4.85, wps=364996, ups=0.75, wpb=489794, bsz=16236.6, num_updates=11600, lr=0.00058722, gnorm=0.284, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=15822 epoch 007: 1396 / 1707 loss=3.806, nll_loss=2.277, ppl=4.85, wps=364996, ups=0.75, wpb=489794, bsz=16236.6, num_updates=11600, lr=0.00058722, gnorm=0.284, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=15822 epoch 007: 1396 / 1707 loss=3.806, nll_loss=2.277, ppl=4.85, wps=364996, ups=0.75, wpb=489794, bsz=16236.6, num_updates=11600, lr=0.00058722, gnorm=0.284, clip=0, loss_scale=8, train_wall=134, gb_free=60.4, wall=15822 epoch 007: 1497 / 1707 loss=3.805, nll_loss=2.276, ppl=4.84, wps=361343, ups=0.74, wpb=489805, bsz=16378.2, num_updates=11700, lr=0.000584705, gnorm=0.292, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=15957 epoch 007: 1497 / 1707 loss=3.805, nll_loss=2.276, ppl=4.84, wps=361343, ups=0.74, wpb=489805, bsz=16378.2, num_updates=11700, lr=0.000584705, gnorm=0.292, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=15957 epoch 007: 1497 / 1707 loss=3.805, nll_loss=2.276, ppl=4.84, wps=361343, ups=0.74, wpb=489805, bsz=16378.2, num_updates=11700, lr=0.000584705, gnorm=0.292, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=15957 epoch 007: 1497 / 1707 loss=3.805, nll_loss=2.276, ppl=4.84, wps=361343, ups=0.74, wpb=489805, bsz=16378.2, num_updates=11700, lr=0.000584705, gnorm=0.292, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=15957 epoch 007: 1497 / 1707 loss=3.805, nll_loss=2.276, ppl=4.84, wps=361343, ups=0.74, wpb=489805, bsz=16378.2, num_updates=11700, lr=0.000584705, gnorm=0.292, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=15957 epoch 007: 1497 / 1707 loss=3.805, nll_loss=2.276, ppl=4.84, wps=361343, ups=0.74, wpb=489805, bsz=16378.2, num_updates=11700, lr=0.000584705, gnorm=0.292, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=15957 epoch 007: 1497 / 1707 loss=3.805, nll_loss=2.276, ppl=4.84, wps=361343, ups=0.74, wpb=489805, bsz=16378.2, num_updates=11700, lr=0.000584705, gnorm=0.292, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=15957 epoch 007: 1597 / 1707 loss=3.807, nll_loss=2.279, ppl=4.85, wps=363728, ups=0.74, wpb=489055, bsz=16473.3, num_updates=11800, lr=0.000582223, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=16092 epoch 007: 1597 / 1707 loss=3.807, nll_loss=2.279, ppl=4.85, wps=363728, ups=0.74, wpb=489055, bsz=16473.3, num_updates=11800, lr=0.000582223, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=16092 epoch 007: 1597 / 1707 loss=3.807, nll_loss=2.279, ppl=4.85, wps=363728, ups=0.74, wpb=489055, bsz=16473.3, num_updates=11800, lr=0.000582223, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=16092 epoch 007: 1597 / 1707 loss=3.807, nll_loss=2.279, ppl=4.85, wps=363728, ups=0.74, wpb=489055, bsz=16473.3, num_updates=11800, lr=0.000582223, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=16092 epoch 007: 1597 / 1707 loss=3.807, nll_loss=2.279, ppl=4.85, wps=363728, ups=0.74, wpb=489055, bsz=16473.3, num_updates=11800, lr=0.000582223, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=16092 epoch 007: 1597 / 1707 loss=3.807, nll_loss=2.279, ppl=4.85, wps=363728, ups=0.74, wpb=489055, bsz=16473.3, num_updates=11800, lr=0.000582223, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=16092 epoch 007: 1597 / 1707 loss=3.807, nll_loss=2.279, ppl=4.85, wps=363728, ups=0.74, wpb=489055, bsz=16473.3, num_updates=11800, lr=0.000582223, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=16092 epoch 007: 1697 / 1707 loss=3.807, nll_loss=2.279, ppl=4.85, wps=365522, ups=0.75, wpb=490004, bsz=16359.4, num_updates=11900, lr=0.000579771, gnorm=0.281, clip=0, loss_scale=8, train_wall=134, gb_free=60.8, wall=16226 epoch 007: 1697 / 1707 loss=3.807, nll_loss=2.279, ppl=4.85, wps=365522, ups=0.75, wpb=490004, bsz=16359.4, num_updates=11900, lr=0.000579771, gnorm=0.281, clip=0, loss_scale=8, train_wall=134, gb_free=60.8, wall=16226 epoch 007: 1697 / 1707 loss=3.807, nll_loss=2.279, ppl=4.85, wps=365522, ups=0.75, wpb=490004, bsz=16359.4, num_updates=11900, lr=0.000579771, gnorm=0.281, clip=0, loss_scale=8, train_wall=134, gb_free=60.8, wall=16226 epoch 007: 1697 / 1707 loss=3.807, nll_loss=2.279, ppl=4.85, wps=365522, ups=0.75, wpb=490004, bsz=16359.4, num_updates=11900, lr=0.000579771, gnorm=0.281, clip=0, loss_scale=8, train_wall=134, gb_free=60.8, wall=16226 epoch 007: 1697 / 1707 loss=3.807, nll_loss=2.279, ppl=4.85, wps=365522, ups=0.75, wpb=490004, bsz=16359.4, num_updates=11900, lr=0.000579771, gnorm=0.281, clip=0, loss_scale=8, train_wall=134, gb_free=60.8, wall=16226 epoch 007: 1697 / 1707 loss=3.807, nll_loss=2.279, ppl=4.85, wps=365522, ups=0.75, wpb=490004, bsz=16359.4, num_updates=11900, lr=0.000579771, gnorm=0.281, clip=0, loss_scale=8, train_wall=134, gb_free=60.8, wall=16226 epoch 007: 1697 / 1707 loss=3.807, nll_loss=2.279, ppl=4.85, wps=365522, ups=0.75, wpb=490004, bsz=16359.4, num_updates=11900, lr=0.000579771, gnorm=0.281, clip=0, loss_scale=8, train_wall=134, gb_free=60.8, wall=16226 end of epoch 7 (average epoch stats below) epoch 007 | loss 3.814 | nll_loss 2.285 | ppl 4.87 | wps 362332 | ups 0.74 | wpb 489913 | bsz 16332.2 | num_updates 11910 | lr 0.000579528 | gnorm 0.29 | clip 0 | loss_scale 8 | train_wall 2274 | gb_free 61.4 | wall 16238 epoch 007 | loss 3.814 | nll_loss 2.285 | ppl 4.87 | wps 362332 | ups 0.74 | wpb 489913 | bsz 16332.2 | num_updates 11910 | lr 0.000579528 | gnorm 0.29 | clip 0 | loss_scale 8 | train_wall 2274 | gb_free 61.4 | wall 16238 epoch 007 | loss 3.814 | nll_loss 2.285 | ppl 4.87 | wps 362332 | ups 0.74 | wpb 489913 | bsz 16332.2 | num_updates 11910 | lr 0.000579528 | gnorm 0.29 | clip 0 | loss_scale 8 | train_wall 2274 | gb_free 61.4 | wall 16238 epoch 007 | loss 3.814 | nll_loss 2.285 | ppl 4.87 | wps 362332 | ups 0.74 | wpb 489913 | bsz 16332.2 | num_updates 11910 | lr 0.000579528 | gnorm 0.29 | clip 0 | loss_scale 8 | train_wall 2274 | gb_free 61.4 | wall 16238 epoch 007 | loss 3.814 | nll_loss 2.285 | ppl 4.87 | wps 362332 | ups 0.74 | wpb 489913 | bsz 16332.2 | num_updates 11910 | lr 0.000579528 | gnorm 0.29 | clip 0 | loss_scale 8 | train_wall 2274 | gb_free 61.4 | wall 16238 epoch 007 | loss 3.814 | nll_loss 2.285 | ppl 4.87 | wps 362332 | ups 0.74 | wpb 489913 | bsz 16332.2 | num_updates 11910 | lr 0.000579528 | gnorm 0.29 | clip 0 | loss_scale 8 | train_wall 2274 | gb_free 61.4 | wall 16238 epoch 007 | loss 3.814 | nll_loss 2.285 | ppl 4.87 | wps 362332 | ups 0.74 | wpb 489913 | bsz 16332.2 | num_updates 11910 | lr 0.000579528 | gnorm 0.29 | clip 0 | loss_scale 8 | train_wall 2274 | gb_free 61.4 | wall 16238 Start iterating over samples epoch 008: 91 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=362399, ups=0.74, wpb=487143, bsz=16124.6, num_updates=12000, lr=0.00057735, gnorm=0.297, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=16360 epoch 008: 91 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=362399, ups=0.74, wpb=487143, bsz=16124.6, num_updates=12000, lr=0.00057735, gnorm=0.297, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=16360 epoch 008: 91 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=362399, ups=0.74, wpb=487143, bsz=16124.6, num_updates=12000, lr=0.00057735, gnorm=0.297, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=16360 epoch 008: 91 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=362399, ups=0.74, wpb=487143, bsz=16124.6, num_updates=12000, lr=0.00057735, gnorm=0.297, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=16360 epoch 008: 91 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=362399, ups=0.74, wpb=487143, bsz=16124.6, num_updates=12000, lr=0.00057735, gnorm=0.297, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=16360 epoch 008: 91 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=362399, ups=0.74, wpb=487143, bsz=16124.6, num_updates=12000, lr=0.00057735, gnorm=0.297, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=16360 epoch 008: 91 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=362399, ups=0.74, wpb=487143, bsz=16124.6, num_updates=12000, lr=0.00057735, gnorm=0.297, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=16360 epoch 008: 91 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=362399, ups=0.74, wpb=487143, bsz=16124.6, num_updates=12000, lr=0.00057735, gnorm=0.297, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=16360 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 3.89 | nll_loss 2.345 | ppl 5.08 | wps 216571 | wpb 22263 | bsz 1004 | num_updates 12000 | best_loss 3.89 epoch 008 | valid on 'valid' subset | loss 3.89 | nll_loss 2.345 | ppl 5.08 | wps 216571 | wpb 22263 | bsz 1004 | num_updates 12000 | best_loss 3.89 epoch 008 | valid on 'valid' subset | loss 3.89 | nll_loss 2.345 | ppl 5.08 | wps 216571 | wpb 22263 | bsz 1004 | num_updates 12000 | best_loss 3.89 epoch 008 | valid on 'valid' subset | loss 3.89 | nll_loss 2.345 | ppl 5.08 | wps 216571 | wpb 22263 | bsz 1004 | num_updates 12000 | best_loss 3.89 epoch 008 | valid on 'valid' subset | loss 3.89 | nll_loss 2.345 | ppl 5.08 | wps 216571 | wpb 22263 | bsz 1004 | num_updates 12000 | best_loss 3.89 epoch 008 | valid on 'valid' subset | loss 3.89 | nll_loss 2.345 | ppl 5.08 | wps 216571 | wpb 22263 | bsz 1004 | num_updates 12000 | best_loss 3.89 epoch 008 | valid on 'valid' subset | loss 3.89 | nll_loss 2.345 | ppl 5.08 | wps 216571 | wpb 22263 | bsz 1004 | num_updates 12000 | best_loss 3.89 epoch 008 | valid on 'valid' subset | loss 3.89 | nll_loss 2.345 | ppl 5.08 | wps 216571 | wpb 22263 | bsz 1004 | num_updates 12000 | best_loss 3.89 epoch 008: 192 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=324978, ups=0.66, wpb=489967, bsz=16070.5, num_updates=12100, lr=0.00057496, gnorm=0.276, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=16511 epoch 008: 192 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=324978, ups=0.66, wpb=489967, bsz=16070.5, num_updates=12100, lr=0.00057496, gnorm=0.276, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=16511 epoch 008: 192 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=324978, ups=0.66, wpb=489967, bsz=16070.5, num_updates=12100, lr=0.00057496, gnorm=0.276, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=16511 epoch 008: 192 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=324978, ups=0.66, wpb=489967, bsz=16070.5, num_updates=12100, lr=0.00057496, gnorm=0.276, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=16511 epoch 008: 192 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=324978, ups=0.66, wpb=489967, bsz=16070.5, num_updates=12100, lr=0.00057496, gnorm=0.276, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=16511 epoch 008: 192 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=324978, ups=0.66, wpb=489967, bsz=16070.5, num_updates=12100, lr=0.00057496, gnorm=0.276, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=16511 epoch 008: 192 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=324978, ups=0.66, wpb=489967, bsz=16070.5, num_updates=12100, lr=0.00057496, gnorm=0.276, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=16511 epoch 008: 192 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=324978, ups=0.66, wpb=489967, bsz=16070.5, num_updates=12100, lr=0.00057496, gnorm=0.276, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=16511 epoch 008: 292 / 1707 loss=3.773, nll_loss=2.24, ppl=4.72, wps=368133, ups=0.75, wpb=490532, bsz=16304.2, num_updates=12200, lr=0.000572598, gnorm=0.274, clip=0, loss_scale=2, train_wall=133, gb_free=61.1, wall=16644 epoch 008: 292 / 1707 loss=3.773, nll_loss=2.24, ppl=4.72, wps=368133, ups=0.75, wpb=490532, bsz=16304.2, num_updates=12200, lr=0.000572598, gnorm=0.274, clip=0, loss_scale=2, train_wall=133, gb_free=61.1, wall=16644 epoch 008: 292 / 1707 loss=3.773, nll_loss=2.24, ppl=4.72, wps=368133, ups=0.75, wpb=490532, bsz=16304.2, num_updates=12200, lr=0.000572598, gnorm=0.274, clip=0, loss_scale=2, train_wall=133, gb_free=61.1, wall=16644 epoch 008: 292 / 1707 loss=3.773, nll_loss=2.24, ppl=4.72, wps=368133, ups=0.75, wpb=490532, bsz=16304.2, num_updates=12200, lr=0.000572598, gnorm=0.274, clip=0, loss_scale=2, train_wall=133, gb_free=61.1, wall=16644 epoch 008: 292 / 1707 loss=3.773, nll_loss=2.24, ppl=4.72, wps=368133, ups=0.75, wpb=490532, bsz=16304.2, num_updates=12200, lr=0.000572598, gnorm=0.274, clip=0, loss_scale=2, train_wall=133, gb_free=61.1, wall=16644 epoch 008: 292 / 1707 loss=3.773, nll_loss=2.24, ppl=4.72, wps=368133, ups=0.75, wpb=490532, bsz=16304.2, num_updates=12200, lr=0.000572598, gnorm=0.274, clip=0, loss_scale=2, train_wall=133, gb_free=61.1, wall=16644 epoch 008: 292 / 1707 loss=3.773, nll_loss=2.24, ppl=4.72, wps=368133, ups=0.75, wpb=490532, bsz=16304.2, num_updates=12200, lr=0.000572598, gnorm=0.274, clip=0, loss_scale=2, train_wall=133, gb_free=61.1, wall=16644 epoch 008: 292 / 1707 loss=3.773, nll_loss=2.24, ppl=4.72, wps=368133, ups=0.75, wpb=490532, bsz=16304.2, num_updates=12200, lr=0.000572598, gnorm=0.274, clip=0, loss_scale=2, train_wall=133, gb_free=61.1, wall=16644 epoch 008: 392 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=366348, ups=0.75, wpb=489896, bsz=16431, num_updates=12300, lr=0.000570266, gnorm=0.278, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=16778 epoch 008: 392 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=366348, ups=0.75, wpb=489896, bsz=16431, num_updates=12300, lr=0.000570266, gnorm=0.278, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=16778 epoch 008: 392 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=366348, ups=0.75, wpb=489896, bsz=16431, num_updates=12300, lr=0.000570266, gnorm=0.278, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=16778 epoch 008: 392 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=366348, ups=0.75, wpb=489896, bsz=16431, num_updates=12300, lr=0.000570266, gnorm=0.278, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=16778 epoch 008: 392 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=366348, ups=0.75, wpb=489896, bsz=16431, num_updates=12300, lr=0.000570266, gnorm=0.278, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=16778 epoch 008: 392 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=366348, ups=0.75, wpb=489896, bsz=16431, num_updates=12300, lr=0.000570266, gnorm=0.278, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=16778 epoch 008: 392 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=366348, ups=0.75, wpb=489896, bsz=16431, num_updates=12300, lr=0.000570266, gnorm=0.278, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=16778 epoch 008: 392 / 1707 loss=3.772, nll_loss=2.239, ppl=4.72, wps=366348, ups=0.75, wpb=489896, bsz=16431, num_updates=12300, lr=0.000570266, gnorm=0.278, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=16778 epoch 008: 492 / 1707 loss=3.775, nll_loss=2.242, ppl=4.73, wps=365287, ups=0.75, wpb=490217, bsz=16489.1, num_updates=12400, lr=0.000567962, gnorm=0.295, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=16912 epoch 008: 492 / 1707 loss=3.775, nll_loss=2.242, ppl=4.73, wps=365287, ups=0.75, wpb=490217, bsz=16489.1, num_updates=12400, lr=0.000567962, gnorm=0.295, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=16912 epoch 008: 492 / 1707 loss=3.775, nll_loss=2.242, ppl=4.73, wps=365287, ups=0.75, wpb=490217, bsz=16489.1, num_updates=12400, lr=0.000567962, gnorm=0.295, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=16912 epoch 008: 492 / 1707 loss=3.775, nll_loss=2.242, ppl=4.73, wps=365287, ups=0.75, wpb=490217, bsz=16489.1, num_updates=12400, lr=0.000567962, gnorm=0.295, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=16912 epoch 008: 492 / 1707 loss=3.775, nll_loss=2.242, ppl=4.73, wps=365287, ups=0.75, wpb=490217, bsz=16489.1, num_updates=12400, lr=0.000567962, gnorm=0.295, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=16912 epoch 008: 492 / 1707 loss=3.775, nll_loss=2.242, ppl=4.73, wps=365287, ups=0.75, wpb=490217, bsz=16489.1, num_updates=12400, lr=0.000567962, gnorm=0.295, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=16912 epoch 008: 492 / 1707 loss=3.775, nll_loss=2.242, ppl=4.73, wps=365287, ups=0.75, wpb=490217, bsz=16489.1, num_updates=12400, lr=0.000567962, gnorm=0.295, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=16912 epoch 008: 492 / 1707 loss=3.775, nll_loss=2.242, ppl=4.73, wps=365287, ups=0.75, wpb=490217, bsz=16489.1, num_updates=12400, lr=0.000567962, gnorm=0.295, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=16912 epoch 008: 592 / 1707 loss=3.771, nll_loss=2.238, ppl=4.72, wps=367121, ups=0.75, wpb=490839, bsz=16333.8, num_updates=12500, lr=0.000565685, gnorm=0.292, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=17046 epoch 008: 592 / 1707 loss=3.771, nll_loss=2.238, ppl=4.72, wps=367121, ups=0.75, wpb=490839, bsz=16333.8, num_updates=12500, lr=0.000565685, gnorm=0.292, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=17046 epoch 008: 592 / 1707 loss=3.771, nll_loss=2.238, ppl=4.72, wps=367121, ups=0.75, wpb=490839, bsz=16333.8, num_updates=12500, lr=0.000565685, gnorm=0.292, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=17046 epoch 008: 592 / 1707 loss=3.771, nll_loss=2.238, ppl=4.72, wps=367121, ups=0.75, wpb=490839, bsz=16333.8, num_updates=12500, lr=0.000565685, gnorm=0.292, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=17046 epoch 008: 592 / 1707 loss=3.771, nll_loss=2.238, ppl=4.72, wps=367121, ups=0.75, wpb=490839, bsz=16333.8, num_updates=12500, lr=0.000565685, gnorm=0.292, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=17046 epoch 008: 592 / 1707 loss=3.771, nll_loss=2.238, ppl=4.72, wps=367121, ups=0.75, wpb=490839, bsz=16333.8, num_updates=12500, lr=0.000565685, gnorm=0.292, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=17046 epoch 008: 592 / 1707 loss=3.771, nll_loss=2.238, ppl=4.72, wps=367121, ups=0.75, wpb=490839, bsz=16333.8, num_updates=12500, lr=0.000565685, gnorm=0.292, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=17046 epoch 008: 592 / 1707 loss=3.771, nll_loss=2.238, ppl=4.72, wps=367121, ups=0.75, wpb=490839, bsz=16333.8, num_updates=12500, lr=0.000565685, gnorm=0.292, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=17046 epoch 008: 693 / 1707 loss=3.774, nll_loss=2.241, ppl=4.73, wps=361979, ups=0.74, wpb=490728, bsz=16525.2, num_updates=12600, lr=0.000563436, gnorm=0.283, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=17181 epoch 008: 693 / 1707 loss=3.774, nll_loss=2.241, ppl=4.73, wps=361979, ups=0.74, wpb=490728, bsz=16525.2, num_updates=12600, lr=0.000563436, gnorm=0.283, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=17181 epoch 008: 693 / 1707 loss=3.774, nll_loss=2.241, ppl=4.73, wps=361979, ups=0.74, wpb=490728, bsz=16525.2, num_updates=12600, lr=0.000563436, gnorm=0.283, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=17181 epoch 008: 693 / 1707 loss=3.774, nll_loss=2.241, ppl=4.73, wps=361979, ups=0.74, wpb=490728, bsz=16525.2, num_updates=12600, lr=0.000563436, gnorm=0.283, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=17181 epoch 008: 693 / 1707 loss=3.774, nll_loss=2.241, ppl=4.73, wps=361979, ups=0.74, wpb=490728, bsz=16525.2, num_updates=12600, lr=0.000563436, gnorm=0.283, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=17181 epoch 008: 693 / 1707 loss=3.774, nll_loss=2.241, ppl=4.73, wps=361979, ups=0.74, wpb=490728, bsz=16525.2, num_updates=12600, lr=0.000563436, gnorm=0.283, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=17181 epoch 008: 693 / 1707 loss=3.774, nll_loss=2.241, ppl=4.73, wps=361979, ups=0.74, wpb=490728, bsz=16525.2, num_updates=12600, lr=0.000563436, gnorm=0.283, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=17181 epoch 008: 693 / 1707 loss=3.774, nll_loss=2.241, ppl=4.73, wps=361979, ups=0.74, wpb=490728, bsz=16525.2, num_updates=12600, lr=0.000563436, gnorm=0.283, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=17181 epoch 008: 793 / 1707 loss=3.775, nll_loss=2.243, ppl=4.74, wps=366638, ups=0.75, wpb=489355, bsz=16326.7, num_updates=12700, lr=0.000561214, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=17315 epoch 008: 793 / 1707 loss=3.775, nll_loss=2.243, ppl=4.74, wps=366638, ups=0.75, wpb=489355, bsz=16326.7, num_updates=12700, lr=0.000561214, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=17315 epoch 008: 793 / 1707 loss=3.775, nll_loss=2.243, ppl=4.74, wps=366638, ups=0.75, wpb=489355, bsz=16326.7, num_updates=12700, lr=0.000561214, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=17315 epoch 008: 793 / 1707 loss=3.775, nll_loss=2.243, ppl=4.74, wps=366638, ups=0.75, wpb=489355, bsz=16326.7, num_updates=12700, lr=0.000561214, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=17315 epoch 008: 793 / 1707 loss=3.775, nll_loss=2.243, ppl=4.74, wps=366638, ups=0.75, wpb=489355, bsz=16326.7, num_updates=12700, lr=0.000561214, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=17315 epoch 008: 793 / 1707 loss=3.775, nll_loss=2.243, ppl=4.74, wps=366638, ups=0.75, wpb=489355, bsz=16326.7, num_updates=12700, lr=0.000561214, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=17315 epoch 008: 793 / 1707 loss=3.775, nll_loss=2.243, ppl=4.74, wps=366638, ups=0.75, wpb=489355, bsz=16326.7, num_updates=12700, lr=0.000561214, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=17315 epoch 008: 793 / 1707 loss=3.775, nll_loss=2.243, ppl=4.74, wps=366638, ups=0.75, wpb=489355, bsz=16326.7, num_updates=12700, lr=0.000561214, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=17315 epoch 008: 893 / 1707 loss=3.772, nll_loss=2.24, ppl=4.72, wps=365195, ups=0.75, wpb=489992, bsz=16224, num_updates=12800, lr=0.000559017, gnorm=0.282, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=17449 epoch 008: 893 / 1707 loss=3.772, nll_loss=2.24, ppl=4.72, wps=365195, ups=0.75, wpb=489992, bsz=16224, num_updates=12800, lr=0.000559017, gnorm=0.282, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=17449 epoch 008: 893 / 1707 loss=3.772, nll_loss=2.24, ppl=4.72, wps=365195, ups=0.75, wpb=489992, bsz=16224, num_updates=12800, lr=0.000559017, gnorm=0.282, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=17449 epoch 008: 893 / 1707 loss=3.772, nll_loss=2.24, ppl=4.72, wps=365195, ups=0.75, wpb=489992, bsz=16224, num_updates=12800, lr=0.000559017, gnorm=0.282, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=17449 epoch 008: 893 / 1707 loss=3.772, nll_loss=2.24, ppl=4.72, wps=365195, ups=0.75, wpb=489992, bsz=16224, num_updates=12800, lr=0.000559017, gnorm=0.282, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=17449 epoch 008: 893 / 1707 loss=3.772, nll_loss=2.24, ppl=4.72, wps=365195, ups=0.75, wpb=489992, bsz=16224, num_updates=12800, lr=0.000559017, gnorm=0.282, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=17449 epoch 008: 893 / 1707 loss=3.772, nll_loss=2.24, ppl=4.72, wps=365195, ups=0.75, wpb=489992, bsz=16224, num_updates=12800, lr=0.000559017, gnorm=0.282, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=17449 epoch 008: 893 / 1707 loss=3.772, nll_loss=2.24, ppl=4.72, wps=365195, ups=0.75, wpb=489992, bsz=16224, num_updates=12800, lr=0.000559017, gnorm=0.282, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=17449 epoch 008: 993 / 1707 loss=3.774, nll_loss=2.243, ppl=4.73, wps=365393, ups=0.75, wpb=490343, bsz=16372.5, num_updates=12900, lr=0.000556846, gnorm=0.273, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=17583 epoch 008: 993 / 1707 loss=3.774, nll_loss=2.243, ppl=4.73, wps=365393, ups=0.75, wpb=490343, bsz=16372.5, num_updates=12900, lr=0.000556846, gnorm=0.273, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=17583 epoch 008: 993 / 1707 loss=3.774, nll_loss=2.243, ppl=4.73, wps=365393, ups=0.75, wpb=490343, bsz=16372.5, num_updates=12900, lr=0.000556846, gnorm=0.273, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=17583 epoch 008: 993 / 1707 loss=3.774, nll_loss=2.243, ppl=4.73, wps=365393, ups=0.75, wpb=490343, bsz=16372.5, num_updates=12900, lr=0.000556846, gnorm=0.273, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=17583 epoch 008: 993 / 1707 loss=3.774, nll_loss=2.243, ppl=4.73, wps=365393, ups=0.75, wpb=490343, bsz=16372.5, num_updates=12900, lr=0.000556846, gnorm=0.273, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=17583 epoch 008: 993 / 1707 loss=3.774, nll_loss=2.243, ppl=4.73, wps=365393, ups=0.75, wpb=490343, bsz=16372.5, num_updates=12900, lr=0.000556846, gnorm=0.273, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=17583 epoch 008: 993 / 1707 loss=3.774, nll_loss=2.243, ppl=4.73, wps=365393, ups=0.75, wpb=490343, bsz=16372.5, num_updates=12900, lr=0.000556846, gnorm=0.273, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=17583 epoch 008: 993 / 1707 loss=3.774, nll_loss=2.243, ppl=4.73, wps=365393, ups=0.75, wpb=490343, bsz=16372.5, num_updates=12900, lr=0.000556846, gnorm=0.273, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=17583 epoch 008: 1093 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=365301, ups=0.75, wpb=489986, bsz=16307.3, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=17717 epoch 008: 1093 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=365301, ups=0.75, wpb=489986, bsz=16307.3, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=17717 epoch 008: 1093 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=365301, ups=0.75, wpb=489986, bsz=16307.3, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=17717 epoch 008: 1093 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=365301, ups=0.75, wpb=489986, bsz=16307.3, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=17717 epoch 008: 1093 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=365301, ups=0.75, wpb=489986, bsz=16307.3, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=17717 epoch 008: 1093 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=365301, ups=0.75, wpb=489986, bsz=16307.3, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=17717 epoch 008: 1093 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=365301, ups=0.75, wpb=489986, bsz=16307.3, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=17717 epoch 008: 1093 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=365301, ups=0.75, wpb=489986, bsz=16307.3, num_updates=13000, lr=0.0005547, gnorm=0.276, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=17717 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 3.889 | nll_loss 2.344 | ppl 5.08 | wps 218829 | wpb 22263 | bsz 1004 | num_updates 13000 | best_loss 3.889 epoch 008 | valid on 'valid' subset | loss 3.889 | nll_loss 2.344 | ppl 5.08 | wps 218829 | wpb 22263 | bsz 1004 | num_updates 13000 | best_loss 3.889 epoch 008 | valid on 'valid' subset | loss 3.889 | nll_loss 2.344 | ppl 5.08 | wps 218829 | wpb 22263 | bsz 1004 | num_updates 13000 | best_loss 3.889 epoch 008 | valid on 'valid' subset | loss 3.889 | nll_loss 2.344 | ppl 5.08 | wps 218829 | wpb 22263 | bsz 1004 | num_updates 13000 | best_loss 3.889 epoch 008 | valid on 'valid' subset | loss 3.889 | nll_loss 2.344 | ppl 5.08 | wps 218829 | wpb 22263 | bsz 1004 | num_updates 13000 | best_loss 3.889 epoch 008 | valid on 'valid' subset | loss 3.889 | nll_loss 2.344 | ppl 5.08 | wps 218829 | wpb 22263 | bsz 1004 | num_updates 13000 | best_loss 3.889 epoch 008 | valid on 'valid' subset | loss 3.889 | nll_loss 2.344 | ppl 5.08 | wps 218829 | wpb 22263 | bsz 1004 | num_updates 13000 | best_loss 3.889 epoch 008 | valid on 'valid' subset | loss 3.889 | nll_loss 2.344 | ppl 5.08 | wps 218829 | wpb 22263 | bsz 1004 | num_updates 13000 | best_loss 3.889 epoch 008: 1194 / 1707 loss=3.776, nll_loss=2.245, ppl=4.74, wps=307085, ups=0.63, wpb=489346, bsz=16343.4, num_updates=13100, lr=0.000552579, gnorm=0.278, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=17877 epoch 008: 1194 / 1707 loss=3.776, nll_loss=2.245, ppl=4.74, wps=307085, ups=0.63, wpb=489346, bsz=16343.4, num_updates=13100, lr=0.000552579, gnorm=0.278, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=17877 epoch 008: 1194 / 1707 loss=3.776, nll_loss=2.245, ppl=4.74, wps=307085, ups=0.63, wpb=489346, bsz=16343.4, num_updates=13100, lr=0.000552579, gnorm=0.278, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=17877 epoch 008: 1194 / 1707 loss=3.776, nll_loss=2.245, ppl=4.74, wps=307085, ups=0.63, wpb=489346, bsz=16343.4, num_updates=13100, lr=0.000552579, gnorm=0.278, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=17877 epoch 008: 1194 / 1707 loss=3.776, nll_loss=2.245, ppl=4.74, wps=307085, ups=0.63, wpb=489346, bsz=16343.4, num_updates=13100, lr=0.000552579, gnorm=0.278, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=17877 epoch 008: 1194 / 1707 loss=3.776, nll_loss=2.245, ppl=4.74, wps=307085, ups=0.63, wpb=489346, bsz=16343.4, num_updates=13100, lr=0.000552579, gnorm=0.278, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=17877 epoch 008: 1194 / 1707 loss=3.776, nll_loss=2.245, ppl=4.74, wps=307085, ups=0.63, wpb=489346, bsz=16343.4, num_updates=13100, lr=0.000552579, gnorm=0.278, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=17877 epoch 008: 1194 / 1707 loss=3.776, nll_loss=2.245, ppl=4.74, wps=307085, ups=0.63, wpb=489346, bsz=16343.4, num_updates=13100, lr=0.000552579, gnorm=0.278, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=17877 epoch 008: 1294 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=366731, ups=0.75, wpb=489945, bsz=16258.6, num_updates=13200, lr=0.000550482, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=18010 epoch 008: 1294 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=366731, ups=0.75, wpb=489945, bsz=16258.6, num_updates=13200, lr=0.000550482, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=18010 epoch 008: 1294 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=366731, ups=0.75, wpb=489945, bsz=16258.6, num_updates=13200, lr=0.000550482, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=18010 epoch 008: 1294 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=366731, ups=0.75, wpb=489945, bsz=16258.6, num_updates=13200, lr=0.000550482, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=18010 epoch 008: 1294 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=366731, ups=0.75, wpb=489945, bsz=16258.6, num_updates=13200, lr=0.000550482, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=18010 epoch 008: 1294 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=366731, ups=0.75, wpb=489945, bsz=16258.6, num_updates=13200, lr=0.000550482, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=18010 epoch 008: 1294 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=366731, ups=0.75, wpb=489945, bsz=16258.6, num_updates=13200, lr=0.000550482, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=18010 epoch 008: 1294 / 1707 loss=3.773, nll_loss=2.241, ppl=4.73, wps=366731, ups=0.75, wpb=489945, bsz=16258.6, num_updates=13200, lr=0.000550482, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=18010 epoch 008: 1394 / 1707 loss=3.771, nll_loss=2.24, ppl=4.72, wps=367318, ups=0.75, wpb=490499, bsz=16499.1, num_updates=13300, lr=0.000548408, gnorm=0.282, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=18144 epoch 008: 1394 / 1707 loss=3.771, nll_loss=2.24, ppl=4.72, wps=367318, ups=0.75, wpb=490499, bsz=16499.1, num_updates=13300, lr=0.000548408, gnorm=0.282, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=18144 epoch 008: 1394 / 1707 loss=3.771, nll_loss=2.24, ppl=4.72, wps=367318, ups=0.75, wpb=490499, bsz=16499.1, num_updates=13300, lr=0.000548408, gnorm=0.282, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=18144 epoch 008: 1394 / 1707 loss=3.771, nll_loss=2.24, ppl=4.72, wps=367318, ups=0.75, wpb=490499, bsz=16499.1, num_updates=13300, lr=0.000548408, gnorm=0.282, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=18144 epoch 008: 1394 / 1707 loss=3.771, nll_loss=2.24, ppl=4.72, wps=367318, ups=0.75, wpb=490499, bsz=16499.1, num_updates=13300, lr=0.000548408, gnorm=0.282, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=18144 epoch 008: 1394 / 1707 loss=3.771, nll_loss=2.24, ppl=4.72, wps=367318, ups=0.75, wpb=490499, bsz=16499.1, num_updates=13300, lr=0.000548408, gnorm=0.282, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=18144 epoch 008: 1394 / 1707 loss=3.771, nll_loss=2.24, ppl=4.72, wps=367318, ups=0.75, wpb=490499, bsz=16499.1, num_updates=13300, lr=0.000548408, gnorm=0.282, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=18144 epoch 008: 1394 / 1707 loss=3.771, nll_loss=2.24, ppl=4.72, wps=367318, ups=0.75, wpb=490499, bsz=16499.1, num_updates=13300, lr=0.000548408, gnorm=0.282, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=18144 epoch 008: 1494 / 1707 loss=3.768, nll_loss=2.237, ppl=4.71, wps=368648, ups=0.75, wpb=490534, bsz=16269.7, num_updates=13400, lr=0.000546358, gnorm=0.28, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=18277 epoch 008: 1494 / 1707 loss=3.768, nll_loss=2.237, ppl=4.71, wps=368648, ups=0.75, wpb=490534, bsz=16269.7, num_updates=13400, lr=0.000546358, gnorm=0.28, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=18277 epoch 008: 1494 / 1707 loss=3.768, nll_loss=2.237, ppl=4.71, wps=368648, ups=0.75, wpb=490534, bsz=16269.7, num_updates=13400, lr=0.000546358, gnorm=0.28, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=18277 epoch 008: 1494 / 1707 loss=3.768, nll_loss=2.237, ppl=4.71, wps=368648, ups=0.75, wpb=490534, bsz=16269.7, num_updates=13400, lr=0.000546358, gnorm=0.28, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=18277 epoch 008: 1494 / 1707 loss=3.768, nll_loss=2.237, ppl=4.71, wps=368648, ups=0.75, wpb=490534, bsz=16269.7, num_updates=13400, lr=0.000546358, gnorm=0.28, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=18277 epoch 008: 1494 / 1707 loss=3.768, nll_loss=2.237, ppl=4.71, wps=368648, ups=0.75, wpb=490534, bsz=16269.7, num_updates=13400, lr=0.000546358, gnorm=0.28, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=18277 epoch 008: 1494 / 1707 loss=3.768, nll_loss=2.237, ppl=4.71, wps=368648, ups=0.75, wpb=490534, bsz=16269.7, num_updates=13400, lr=0.000546358, gnorm=0.28, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=18277 epoch 008: 1494 / 1707 loss=3.768, nll_loss=2.237, ppl=4.71, wps=368648, ups=0.75, wpb=490534, bsz=16269.7, num_updates=13400, lr=0.000546358, gnorm=0.28, clip=0, loss_scale=8, train_wall=133, gb_free=60.6, wall=18277 epoch 008: 1595 / 1707 loss=3.771, nll_loss=2.24, ppl=4.73, wps=362231, ups=0.74, wpb=489403, bsz=16403.3, num_updates=13500, lr=0.000544331, gnorm=0.273, clip=0, loss_scale=4, train_wall=135, gb_free=60.7, wall=18412 epoch 008: 1595 / 1707 loss=3.771, nll_loss=2.24, ppl=4.73, wps=362231, ups=0.74, wpb=489403, bsz=16403.3, num_updates=13500, lr=0.000544331, gnorm=0.273, clip=0, loss_scale=4, train_wall=135, gb_free=60.7, wall=18412 epoch 008: 1595 / 1707 loss=3.771, nll_loss=2.24, ppl=4.73, wps=362231, ups=0.74, wpb=489403, bsz=16403.3, num_updates=13500, lr=0.000544331, gnorm=0.273, clip=0, loss_scale=4, train_wall=135, gb_free=60.7, wall=18412 epoch 008: 1595 / 1707 loss=3.771, nll_loss=2.24, ppl=4.73, wps=362231, ups=0.74, wpb=489403, bsz=16403.3, num_updates=13500, lr=0.000544331, gnorm=0.273, clip=0, loss_scale=4, train_wall=135, gb_free=60.7, wall=18412 epoch 008: 1595 / 1707 loss=3.771, nll_loss=2.24, ppl=4.73, wps=362231, ups=0.74, wpb=489403, bsz=16403.3, num_updates=13500, lr=0.000544331, gnorm=0.273, clip=0, loss_scale=4, train_wall=135, gb_free=60.7, wall=18412 epoch 008: 1595 / 1707 loss=3.771, nll_loss=2.24, ppl=4.73, wps=362231, ups=0.74, wpb=489403, bsz=16403.3, num_updates=13500, lr=0.000544331, gnorm=0.273, clip=0, loss_scale=4, train_wall=135, gb_free=60.7, wall=18412 epoch 008: 1595 / 1707 loss=3.771, nll_loss=2.24, ppl=4.73, wps=362231, ups=0.74, wpb=489403, bsz=16403.3, num_updates=13500, lr=0.000544331, gnorm=0.273, clip=0, loss_scale=4, train_wall=135, gb_free=60.7, wall=18412 epoch 008: 1595 / 1707 loss=3.771, nll_loss=2.24, ppl=4.73, wps=362231, ups=0.74, wpb=489403, bsz=16403.3, num_updates=13500, lr=0.000544331, gnorm=0.273, clip=0, loss_scale=4, train_wall=135, gb_free=60.7, wall=18412 epoch 008: 1695 / 1707 loss=3.77, nll_loss=2.239, ppl=4.72, wps=365523, ups=0.75, wpb=489475, bsz=16219.4, num_updates=13600, lr=0.000542326, gnorm=0.279, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=18546 epoch 008: 1695 / 1707 loss=3.77, nll_loss=2.239, ppl=4.72, wps=365523, ups=0.75, wpb=489475, bsz=16219.4, num_updates=13600, lr=0.000542326, gnorm=0.279, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=18546 epoch 008: 1695 / 1707 loss=3.77, nll_loss=2.239, ppl=4.72, wps=365523, ups=0.75, wpb=489475, bsz=16219.4, num_updates=13600, lr=0.000542326, gnorm=0.279, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=18546 epoch 008: 1695 / 1707 loss=3.77, nll_loss=2.239, ppl=4.72, wps=365523, ups=0.75, wpb=489475, bsz=16219.4, num_updates=13600, lr=0.000542326, gnorm=0.279, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=18546 epoch 008: 1695 / 1707 loss=3.77, nll_loss=2.239, ppl=4.72, wps=365523, ups=0.75, wpb=489475, bsz=16219.4, num_updates=13600, lr=0.000542326, gnorm=0.279, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=18546 epoch 008: 1695 / 1707 loss=3.77, nll_loss=2.239, ppl=4.72, wps=365523, ups=0.75, wpb=489475, bsz=16219.4, num_updates=13600, lr=0.000542326, gnorm=0.279, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=18546 epoch 008: 1695 / 1707 loss=3.77, nll_loss=2.239, ppl=4.72, wps=365523, ups=0.75, wpb=489475, bsz=16219.4, num_updates=13600, lr=0.000542326, gnorm=0.279, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=18546 epoch 008: 1695 / 1707 loss=3.77, nll_loss=2.239, ppl=4.72, wps=365523, ups=0.75, wpb=489475, bsz=16219.4, num_updates=13600, lr=0.000542326, gnorm=0.279, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=18546 end of epoch 8 (average epoch stats below) epoch 008 | loss 3.772 | nll_loss 2.24 | ppl 4.72 | wps 358939 | ups 0.73 | wpb 489905 | bsz 16332 | num_updates 13612 | lr 0.000542087 | gnorm 0.279 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 61.8 | wall 18561 epoch 008 | loss 3.772 | nll_loss 2.24 | ppl 4.72 | wps 358939 | ups 0.73 | wpb 489905 | bsz 16332 | num_updates 13612 | lr 0.000542087 | gnorm 0.279 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 61.8 | wall 18561 epoch 008 | loss 3.772 | nll_loss 2.24 | ppl 4.72 | wps 358939 | ups 0.73 | wpb 489905 | bsz 16332 | num_updates 13612 | lr 0.000542087 | gnorm 0.279 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 61.8 | wall 18561 epoch 008 | loss 3.772 | nll_loss 2.24 | ppl 4.72 | wps 358939 | ups 0.73 | wpb 489905 | bsz 16332 | num_updates 13612 | lr 0.000542087 | gnorm 0.279 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 61.8 | wall 18561 epoch 008 | loss 3.772 | nll_loss 2.24 | ppl 4.72 | wps 358939 | ups 0.73 | wpb 489905 | bsz 16332 | num_updates 13612 | lr 0.000542087 | gnorm 0.279 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 61.8 | wall 18561 epoch 008 | loss 3.772 | nll_loss 2.24 | ppl 4.72 | wps 358939 | ups 0.73 | wpb 489905 | bsz 16332 | num_updates 13612 | lr 0.000542087 | gnorm 0.279 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 61.8 | wall 18561 epoch 008 | loss 3.772 | nll_loss 2.24 | ppl 4.72 | wps 358939 | ups 0.73 | wpb 489905 | bsz 16332 | num_updates 13612 | lr 0.000542087 | gnorm 0.279 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 61.8 | wall 18561 epoch 008 | loss 3.772 | nll_loss 2.24 | ppl 4.72 | wps 358939 | ups 0.73 | wpb 489905 | bsz 16332 | num_updates 13612 | lr 0.000542087 | gnorm 0.279 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 61.8 | wall 18561 Start iterating over samples epoch 009: 88 / 1707 loss=3.734, nll_loss=2.198, ppl=4.59, wps=363285, ups=0.75, wpb=486230, bsz=16464.4, num_updates=13700, lr=0.000540343, gnorm=0.282, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=18680 epoch 009: 88 / 1707 loss=3.734, nll_loss=2.198, ppl=4.59, wps=363285, ups=0.75, wpb=486230, bsz=16464.4, num_updates=13700, lr=0.000540343, gnorm=0.282, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=18680 epoch 009: 88 / 1707 loss=3.734, nll_loss=2.198, ppl=4.59, wps=363285, ups=0.75, wpb=486230, bsz=16464.4, num_updates=13700, lr=0.000540343, gnorm=0.282, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=18680 epoch 009: 88 / 1707 loss=3.734, nll_loss=2.198, ppl=4.59, wps=363285, ups=0.75, wpb=486230, bsz=16464.4, num_updates=13700, lr=0.000540343, gnorm=0.282, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=18680 epoch 009: 88 / 1707 loss=3.734, nll_loss=2.198, ppl=4.59, wps=363285, ups=0.75, wpb=486230, bsz=16464.4, num_updates=13700, lr=0.000540343, gnorm=0.282, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=18680 epoch 009: 88 / 1707 loss=3.734, nll_loss=2.198, ppl=4.59, wps=363285, ups=0.75, wpb=486230, bsz=16464.4, num_updates=13700, lr=0.000540343, gnorm=0.282, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=18680 epoch 009: 88 / 1707 loss=3.734, nll_loss=2.198, ppl=4.59, wps=363285, ups=0.75, wpb=486230, bsz=16464.4, num_updates=13700, lr=0.000540343, gnorm=0.282, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=18680 epoch 009: 88 / 1707 loss=3.734, nll_loss=2.198, ppl=4.59, wps=363285, ups=0.75, wpb=486230, bsz=16464.4, num_updates=13700, lr=0.000540343, gnorm=0.282, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=18680 epoch 009: 88 / 1707 loss=3.734, nll_loss=2.198, ppl=4.59, wps=363285, ups=0.75, wpb=486230, bsz=16464.4, num_updates=13700, lr=0.000540343, gnorm=0.282, clip=0, loss_scale=8, train_wall=133, gb_free=60.3, wall=18680 epoch 009: 189 / 1707 loss=3.733, nll_loss=2.196, ppl=4.58, wps=364179, ups=0.74, wpb=490207, bsz=16251.4, num_updates=13800, lr=0.000538382, gnorm=0.272, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=18814 epoch 009: 189 / 1707 loss=3.733, nll_loss=2.196, ppl=4.58, wps=364179, ups=0.74, wpb=490207, bsz=16251.4, num_updates=13800, lr=0.000538382, gnorm=0.272, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=18814 epoch 009: 189 / 1707 loss=3.733, nll_loss=2.196, ppl=4.58, wps=364179, ups=0.74, wpb=490207, bsz=16251.4, num_updates=13800, lr=0.000538382, gnorm=0.272, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=18814 epoch 009: 189 / 1707 loss=3.733, nll_loss=2.196, ppl=4.58, wps=364179, ups=0.74, wpb=490207, bsz=16251.4, num_updates=13800, lr=0.000538382, gnorm=0.272, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=18814 epoch 009: 189 / 1707 loss=3.733, nll_loss=2.196, ppl=4.58, wps=364179, ups=0.74, wpb=490207, bsz=16251.4, num_updates=13800, lr=0.000538382, gnorm=0.272, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=18814 epoch 009: 189 / 1707 loss=3.733, nll_loss=2.196, ppl=4.58, wps=364179, ups=0.74, wpb=490207, bsz=16251.4, num_updates=13800, lr=0.000538382, gnorm=0.272, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=18814 epoch 009: 189 / 1707 loss=3.733, nll_loss=2.196, ppl=4.58, wps=364179, ups=0.74, wpb=490207, bsz=16251.4, num_updates=13800, lr=0.000538382, gnorm=0.272, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=18814 epoch 009: 189 / 1707 loss=3.733, nll_loss=2.196, ppl=4.58, wps=364179, ups=0.74, wpb=490207, bsz=16251.4, num_updates=13800, lr=0.000538382, gnorm=0.272, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=18814 epoch 009: 189 / 1707 loss=3.733, nll_loss=2.196, ppl=4.58, wps=364179, ups=0.74, wpb=490207, bsz=16251.4, num_updates=13800, lr=0.000538382, gnorm=0.272, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=18814 epoch 009: 289 / 1707 loss=3.738, nll_loss=2.202, ppl=4.6, wps=365822, ups=0.75, wpb=489599, bsz=16418.8, num_updates=13900, lr=0.000536442, gnorm=0.273, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=18948 epoch 009: 289 / 1707 loss=3.738, nll_loss=2.202, ppl=4.6, wps=365822, ups=0.75, wpb=489599, bsz=16418.8, num_updates=13900, lr=0.000536442, gnorm=0.273, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=18948 epoch 009: 289 / 1707 loss=3.738, nll_loss=2.202, ppl=4.6, wps=365822, ups=0.75, wpb=489599, bsz=16418.8, num_updates=13900, lr=0.000536442, gnorm=0.273, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=18948 epoch 009: 289 / 1707 loss=3.738, nll_loss=2.202, ppl=4.6, wps=365822, ups=0.75, wpb=489599, bsz=16418.8, num_updates=13900, lr=0.000536442, gnorm=0.273, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=18948 epoch 009: 289 / 1707 loss=3.738, nll_loss=2.202, ppl=4.6, wps=365822, ups=0.75, wpb=489599, bsz=16418.8, num_updates=13900, lr=0.000536442, gnorm=0.273, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=18948 epoch 009: 289 / 1707 loss=3.738, nll_loss=2.202, ppl=4.6, wps=365822, ups=0.75, wpb=489599, bsz=16418.8, num_updates=13900, lr=0.000536442, gnorm=0.273, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=18948 epoch 009: 289 / 1707 loss=3.738, nll_loss=2.202, ppl=4.6, wps=365822, ups=0.75, wpb=489599, bsz=16418.8, num_updates=13900, lr=0.000536442, gnorm=0.273, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=18948 epoch 009: 289 / 1707 loss=3.738, nll_loss=2.202, ppl=4.6, wps=365822, ups=0.75, wpb=489599, bsz=16418.8, num_updates=13900, lr=0.000536442, gnorm=0.273, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=18948 epoch 009: 289 / 1707 loss=3.738, nll_loss=2.202, ppl=4.6, wps=365822, ups=0.75, wpb=489599, bsz=16418.8, num_updates=13900, lr=0.000536442, gnorm=0.273, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=18948 epoch 009: 389 / 1707 loss=3.736, nll_loss=2.2, ppl=4.59, wps=366899, ups=0.75, wpb=488981, bsz=16010.2, num_updates=14000, lr=0.000534522, gnorm=0.266, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=19082 epoch 009: 389 / 1707 loss=3.736, nll_loss=2.2, ppl=4.59, wps=366899, ups=0.75, wpb=488981, bsz=16010.2, num_updates=14000, lr=0.000534522, gnorm=0.266, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=19082 epoch 009: 389 / 1707 loss=3.736, nll_loss=2.2, ppl=4.59, wps=366899, ups=0.75, wpb=488981, bsz=16010.2, num_updates=14000, lr=0.000534522, gnorm=0.266, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=19082 epoch 009: 389 / 1707 loss=3.736, nll_loss=2.2, ppl=4.59, wps=366899, ups=0.75, wpb=488981, bsz=16010.2, num_updates=14000, lr=0.000534522, gnorm=0.266, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=19082 epoch 009: 389 / 1707 loss=3.736, nll_loss=2.2, ppl=4.59, wps=366899, ups=0.75, wpb=488981, bsz=16010.2, num_updates=14000, lr=0.000534522, gnorm=0.266, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=19082 epoch 009: 389 / 1707 loss=3.736, nll_loss=2.2, ppl=4.59, wps=366899, ups=0.75, wpb=488981, bsz=16010.2, num_updates=14000, lr=0.000534522, gnorm=0.266, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=19082 epoch 009: 389 / 1707 loss=3.736, nll_loss=2.2, ppl=4.59, wps=366899, ups=0.75, wpb=488981, bsz=16010.2, num_updates=14000, lr=0.000534522, gnorm=0.266, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=19082 epoch 009: 389 / 1707 loss=3.736, nll_loss=2.2, ppl=4.59, wps=366899, ups=0.75, wpb=488981, bsz=16010.2, num_updates=14000, lr=0.000534522, gnorm=0.266, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=19082 epoch 009: 389 / 1707 loss=3.736, nll_loss=2.2, ppl=4.59, wps=366899, ups=0.75, wpb=488981, bsz=16010.2, num_updates=14000, lr=0.000534522, gnorm=0.266, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=19082 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 3.865 | nll_loss 2.321 | ppl 5 | wps 222285 | wpb 22263 | bsz 1004 | num_updates 14000 | best_loss 3.865 epoch 009 | valid on 'valid' subset | loss 3.865 | nll_loss 2.321 | ppl 5 | wps 222285 | wpb 22263 | bsz 1004 | num_updates 14000 | best_loss 3.865 epoch 009 | valid on 'valid' subset | loss 3.865 | nll_loss 2.321 | ppl 5 | wps 222285 | wpb 22263 | bsz 1004 | num_updates 14000 | best_loss 3.865 epoch 009 | valid on 'valid' subset | loss 3.865 | nll_loss 2.321 | ppl 5 | wps 222285 | wpb 22263 | bsz 1004 | num_updates 14000 | best_loss 3.865 epoch 009 | valid on 'valid' subset | loss 3.865 | nll_loss 2.321 | ppl 5 | wps 222285 | wpb 22263 | bsz 1004 | num_updates 14000 | best_loss 3.865 epoch 009 | valid on 'valid' subset | loss 3.865 | nll_loss 2.321 | ppl 5 | wps 222285 | wpb 22263 | bsz 1004 | num_updates 14000 | best_loss 3.865 epoch 009 | valid on 'valid' subset | loss 3.865 | nll_loss 2.321 | ppl 5 | wps 222285 | wpb 22263 | bsz 1004 | num_updates 14000 | best_loss 3.865 epoch 009 | valid on 'valid' subset | loss 3.865 | nll_loss 2.321 | ppl 5 | wps 222285 | wpb 22263 | bsz 1004 | num_updates 14000 | best_loss 3.865 epoch 009 | valid on 'valid' subset | loss 3.865 | nll_loss 2.321 | ppl 5 | wps 222285 | wpb 22263 | bsz 1004 | num_updates 14000 | best_loss 3.865 epoch 009: 490 / 1707 loss=3.743, nll_loss=2.208, ppl=4.62, wps=324723, ups=0.66, wpb=490110, bsz=16485, num_updates=14100, lr=0.000532624, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=19233 epoch 009: 490 / 1707 loss=3.743, nll_loss=2.208, ppl=4.62, wps=324723, ups=0.66, wpb=490110, bsz=16485, num_updates=14100, lr=0.000532624, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=19233 epoch 009: 490 / 1707 loss=3.743, nll_loss=2.208, ppl=4.62, wps=324723, ups=0.66, wpb=490110, bsz=16485, num_updates=14100, lr=0.000532624, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=19233 epoch 009: 490 / 1707 loss=3.743, nll_loss=2.208, ppl=4.62, wps=324723, ups=0.66, wpb=490110, bsz=16485, num_updates=14100, lr=0.000532624, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=19233 epoch 009: 490 / 1707 loss=3.743, nll_loss=2.208, ppl=4.62, wps=324723, ups=0.66, wpb=490110, bsz=16485, num_updates=14100, lr=0.000532624, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=19233 epoch 009: 490 / 1707 loss=3.743, nll_loss=2.208, ppl=4.62, wps=324723, ups=0.66, wpb=490110, bsz=16485, num_updates=14100, lr=0.000532624, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=19233 epoch 009: 490 / 1707 loss=3.743, nll_loss=2.208, ppl=4.62, wps=324723, ups=0.66, wpb=490110, bsz=16485, num_updates=14100, lr=0.000532624, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=19233 epoch 009: 490 / 1707 loss=3.743, nll_loss=2.208, ppl=4.62, wps=324723, ups=0.66, wpb=490110, bsz=16485, num_updates=14100, lr=0.000532624, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=19233 epoch 009: 490 / 1707 loss=3.743, nll_loss=2.208, ppl=4.62, wps=324723, ups=0.66, wpb=490110, bsz=16485, num_updates=14100, lr=0.000532624, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=19233 epoch 009: 590 / 1707 loss=3.741, nll_loss=2.206, ppl=4.61, wps=367359, ups=0.75, wpb=489859, bsz=16168.5, num_updates=14200, lr=0.000530745, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=19366 epoch 009: 590 / 1707 loss=3.741, nll_loss=2.206, ppl=4.61, wps=367359, ups=0.75, wpb=489859, bsz=16168.5, num_updates=14200, lr=0.000530745, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=19366 epoch 009: 590 / 1707 loss=3.741, nll_loss=2.206, ppl=4.61, wps=367359, ups=0.75, wpb=489859, bsz=16168.5, num_updates=14200, lr=0.000530745, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=19366 epoch 009: 590 / 1707 loss=3.741, nll_loss=2.206, ppl=4.61, wps=367359, ups=0.75, wpb=489859, bsz=16168.5, num_updates=14200, lr=0.000530745, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=19366 epoch 009: 590 / 1707 loss=3.741, nll_loss=2.206, ppl=4.61, wps=367359, ups=0.75, wpb=489859, bsz=16168.5, num_updates=14200, lr=0.000530745, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=19366 epoch 009: 590 / 1707 loss=3.741, nll_loss=2.206, ppl=4.61, wps=367359, ups=0.75, wpb=489859, bsz=16168.5, num_updates=14200, lr=0.000530745, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=19366 epoch 009: 590 / 1707 loss=3.741, nll_loss=2.206, ppl=4.61, wps=367359, ups=0.75, wpb=489859, bsz=16168.5, num_updates=14200, lr=0.000530745, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=19366 epoch 009: 590 / 1707 loss=3.741, nll_loss=2.206, ppl=4.61, wps=367359, ups=0.75, wpb=489859, bsz=16168.5, num_updates=14200, lr=0.000530745, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=19366 epoch 009: 590 / 1707 loss=3.741, nll_loss=2.206, ppl=4.61, wps=367359, ups=0.75, wpb=489859, bsz=16168.5, num_updates=14200, lr=0.000530745, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=19366 epoch 009: 690 / 1707 loss=3.745, nll_loss=2.21, ppl=4.63, wps=367535, ups=0.75, wpb=489624, bsz=16234.3, num_updates=14300, lr=0.000528886, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19499 epoch 009: 690 / 1707 loss=3.745, nll_loss=2.21, ppl=4.63, wps=367535, ups=0.75, wpb=489624, bsz=16234.3, num_updates=14300, lr=0.000528886, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19499 epoch 009: 690 / 1707 loss=3.745, nll_loss=2.21, ppl=4.63, wps=367535, ups=0.75, wpb=489624, bsz=16234.3, num_updates=14300, lr=0.000528886, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19499 epoch 009: 690 / 1707 loss=3.745, nll_loss=2.21, ppl=4.63, wps=367535, ups=0.75, wpb=489624, bsz=16234.3, num_updates=14300, lr=0.000528886, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19499 epoch 009: 690 / 1707 loss=3.745, nll_loss=2.21, ppl=4.63, wps=367535, ups=0.75, wpb=489624, bsz=16234.3, num_updates=14300, lr=0.000528886, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19499 epoch 009: 690 / 1707 loss=3.745, nll_loss=2.21, ppl=4.63, wps=367535, ups=0.75, wpb=489624, bsz=16234.3, num_updates=14300, lr=0.000528886, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19499 epoch 009: 690 / 1707 loss=3.745, nll_loss=2.21, ppl=4.63, wps=367535, ups=0.75, wpb=489624, bsz=16234.3, num_updates=14300, lr=0.000528886, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19499 epoch 009: 690 / 1707 loss=3.745, nll_loss=2.21, ppl=4.63, wps=367535, ups=0.75, wpb=489624, bsz=16234.3, num_updates=14300, lr=0.000528886, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19499 epoch 009: 690 / 1707 loss=3.745, nll_loss=2.21, ppl=4.63, wps=367535, ups=0.75, wpb=489624, bsz=16234.3, num_updates=14300, lr=0.000528886, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19499 epoch 009: 790 / 1707 loss=3.744, nll_loss=2.209, ppl=4.62, wps=367651, ups=0.75, wpb=490125, bsz=16445.4, num_updates=14400, lr=0.000527046, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=60.8, wall=19632 epoch 009: 790 / 1707 loss=3.744, nll_loss=2.209, ppl=4.62, wps=367651, ups=0.75, wpb=490125, bsz=16445.4, num_updates=14400, lr=0.000527046, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=60.8, wall=19632 epoch 009: 790 / 1707 loss=3.744, nll_loss=2.209, ppl=4.62, wps=367651, ups=0.75, wpb=490125, bsz=16445.4, num_updates=14400, lr=0.000527046, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=60.8, wall=19632 epoch 009: 790 / 1707 loss=3.744, nll_loss=2.209, ppl=4.62, wps=367651, ups=0.75, wpb=490125, bsz=16445.4, num_updates=14400, lr=0.000527046, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=60.8, wall=19632 epoch 009: 790 / 1707 loss=3.744, nll_loss=2.209, ppl=4.62, wps=367651, ups=0.75, wpb=490125, bsz=16445.4, num_updates=14400, lr=0.000527046, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=60.8, wall=19632 epoch 009: 790 / 1707 loss=3.744, nll_loss=2.209, ppl=4.62, wps=367651, ups=0.75, wpb=490125, bsz=16445.4, num_updates=14400, lr=0.000527046, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=60.8, wall=19632 epoch 009: 790 / 1707 loss=3.744, nll_loss=2.209, ppl=4.62, wps=367651, ups=0.75, wpb=490125, bsz=16445.4, num_updates=14400, lr=0.000527046, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=60.8, wall=19632 epoch 009: 790 / 1707 loss=3.744, nll_loss=2.209, ppl=4.62, wps=367651, ups=0.75, wpb=490125, bsz=16445.4, num_updates=14400, lr=0.000527046, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=60.8, wall=19632 epoch 009: 790 / 1707 loss=3.744, nll_loss=2.209, ppl=4.62, wps=367651, ups=0.75, wpb=490125, bsz=16445.4, num_updates=14400, lr=0.000527046, gnorm=0.276, clip=0, loss_scale=8, train_wall=133, gb_free=60.8, wall=19632 epoch 009: 891 / 1707 loss=3.746, nll_loss=2.212, ppl=4.63, wps=362870, ups=0.74, wpb=490640, bsz=16432.2, num_updates=14500, lr=0.000525226, gnorm=0.268, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=19768 epoch 009: 891 / 1707 loss=3.746, nll_loss=2.212, ppl=4.63, wps=362870, ups=0.74, wpb=490640, bsz=16432.2, num_updates=14500, lr=0.000525226, gnorm=0.268, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=19768 epoch 009: 891 / 1707 loss=3.746, nll_loss=2.212, ppl=4.63, wps=362870, ups=0.74, wpb=490640, bsz=16432.2, num_updates=14500, lr=0.000525226, gnorm=0.268, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=19768 epoch 009: 891 / 1707 loss=3.746, nll_loss=2.212, ppl=4.63, wps=362870, ups=0.74, wpb=490640, bsz=16432.2, num_updates=14500, lr=0.000525226, gnorm=0.268, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=19768 epoch 009: 891 / 1707 loss=3.746, nll_loss=2.212, ppl=4.63, wps=362870, ups=0.74, wpb=490640, bsz=16432.2, num_updates=14500, lr=0.000525226, gnorm=0.268, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=19768 epoch 009: 891 / 1707 loss=3.746, nll_loss=2.212, ppl=4.63, wps=362870, ups=0.74, wpb=490640, bsz=16432.2, num_updates=14500, lr=0.000525226, gnorm=0.268, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=19768 epoch 009: 891 / 1707 loss=3.746, nll_loss=2.212, ppl=4.63, wps=362870, ups=0.74, wpb=490640, bsz=16432.2, num_updates=14500, lr=0.000525226, gnorm=0.268, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=19768 epoch 009: 891 / 1707 loss=3.746, nll_loss=2.212, ppl=4.63, wps=362870, ups=0.74, wpb=490640, bsz=16432.2, num_updates=14500, lr=0.000525226, gnorm=0.268, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=19768 epoch 009: 891 / 1707 loss=3.746, nll_loss=2.212, ppl=4.63, wps=362870, ups=0.74, wpb=490640, bsz=16432.2, num_updates=14500, lr=0.000525226, gnorm=0.268, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=19768 epoch 009: 991 / 1707 loss=3.748, nll_loss=2.214, ppl=4.64, wps=365834, ups=0.75, wpb=489536, bsz=16626.2, num_updates=14600, lr=0.000523424, gnorm=0.277, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19901 epoch 009: 991 / 1707 loss=3.748, nll_loss=2.214, ppl=4.64, wps=365834, ups=0.75, wpb=489536, bsz=16626.2, num_updates=14600, lr=0.000523424, gnorm=0.277, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19901 epoch 009: 991 / 1707 loss=3.748, nll_loss=2.214, ppl=4.64, wps=365834, ups=0.75, wpb=489536, bsz=16626.2, num_updates=14600, lr=0.000523424, gnorm=0.277, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19901 epoch 009: 991 / 1707 loss=3.748, nll_loss=2.214, ppl=4.64, wps=365834, ups=0.75, wpb=489536, bsz=16626.2, num_updates=14600, lr=0.000523424, gnorm=0.277, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19901 epoch 009: 991 / 1707 loss=3.748, nll_loss=2.214, ppl=4.64, wps=365834, ups=0.75, wpb=489536, bsz=16626.2, num_updates=14600, lr=0.000523424, gnorm=0.277, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19901 epoch 009: 991 / 1707 loss=3.748, nll_loss=2.214, ppl=4.64, wps=365834, ups=0.75, wpb=489536, bsz=16626.2, num_updates=14600, lr=0.000523424, gnorm=0.277, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19901 epoch 009: 991 / 1707 loss=3.748, nll_loss=2.214, ppl=4.64, wps=365834, ups=0.75, wpb=489536, bsz=16626.2, num_updates=14600, lr=0.000523424, gnorm=0.277, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19901 epoch 009: 991 / 1707 loss=3.748, nll_loss=2.214, ppl=4.64, wps=365834, ups=0.75, wpb=489536, bsz=16626.2, num_updates=14600, lr=0.000523424, gnorm=0.277, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19901 epoch 009: 991 / 1707 loss=3.748, nll_loss=2.214, ppl=4.64, wps=365834, ups=0.75, wpb=489536, bsz=16626.2, num_updates=14600, lr=0.000523424, gnorm=0.277, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=19901 epoch 009: 1092 / 1707 loss=3.745, nll_loss=2.211, ppl=4.63, wps=360185, ups=0.74, wpb=489454, bsz=16361.4, num_updates=14700, lr=0.000521641, gnorm=0.289, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=20037 epoch 009: 1092 / 1707 loss=3.745, nll_loss=2.211, ppl=4.63, wps=360185, ups=0.74, wpb=489454, bsz=16361.4, num_updates=14700, lr=0.000521641, gnorm=0.289, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=20037 epoch 009: 1092 / 1707 loss=3.745, nll_loss=2.211, ppl=4.63, wps=360185, ups=0.74, wpb=489454, bsz=16361.4, num_updates=14700, lr=0.000521641, gnorm=0.289, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=20037 epoch 009: 1092 / 1707 loss=3.745, nll_loss=2.211, ppl=4.63, wps=360185, ups=0.74, wpb=489454, bsz=16361.4, num_updates=14700, lr=0.000521641, gnorm=0.289, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=20037 epoch 009: 1092 / 1707 loss=3.745, nll_loss=2.211, ppl=4.63, wps=360185, ups=0.74, wpb=489454, bsz=16361.4, num_updates=14700, lr=0.000521641, gnorm=0.289, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=20037 epoch 009: 1092 / 1707 loss=3.745, nll_loss=2.211, ppl=4.63, wps=360185, ups=0.74, wpb=489454, bsz=16361.4, num_updates=14700, lr=0.000521641, gnorm=0.289, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=20037 epoch 009: 1092 / 1707 loss=3.745, nll_loss=2.211, ppl=4.63, wps=360185, ups=0.74, wpb=489454, bsz=16361.4, num_updates=14700, lr=0.000521641, gnorm=0.289, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=20037 epoch 009: 1092 / 1707 loss=3.745, nll_loss=2.211, ppl=4.63, wps=360185, ups=0.74, wpb=489454, bsz=16361.4, num_updates=14700, lr=0.000521641, gnorm=0.289, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=20037 epoch 009: 1092 / 1707 loss=3.745, nll_loss=2.211, ppl=4.63, wps=360185, ups=0.74, wpb=489454, bsz=16361.4, num_updates=14700, lr=0.000521641, gnorm=0.289, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=20037 epoch 009: 1192 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=367212, ups=0.75, wpb=490441, bsz=16298.6, num_updates=14800, lr=0.000519875, gnorm=0.266, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=20171 epoch 009: 1192 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=367212, ups=0.75, wpb=490441, bsz=16298.6, num_updates=14800, lr=0.000519875, gnorm=0.266, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=20171 epoch 009: 1192 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=367212, ups=0.75, wpb=490441, bsz=16298.6, num_updates=14800, lr=0.000519875, gnorm=0.266, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=20171 epoch 009: 1192 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=367212, ups=0.75, wpb=490441, bsz=16298.6, num_updates=14800, lr=0.000519875, gnorm=0.266, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=20171 epoch 009: 1192 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=367212, ups=0.75, wpb=490441, bsz=16298.6, num_updates=14800, lr=0.000519875, gnorm=0.266, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=20171 epoch 009: 1192 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=367212, ups=0.75, wpb=490441, bsz=16298.6, num_updates=14800, lr=0.000519875, gnorm=0.266, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=20171 epoch 009: 1192 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=367212, ups=0.75, wpb=490441, bsz=16298.6, num_updates=14800, lr=0.000519875, gnorm=0.266, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=20171 epoch 009: 1192 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=367212, ups=0.75, wpb=490441, bsz=16298.6, num_updates=14800, lr=0.000519875, gnorm=0.266, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=20171 epoch 009: 1192 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=367212, ups=0.75, wpb=490441, bsz=16298.6, num_updates=14800, lr=0.000519875, gnorm=0.266, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=20171 epoch 009: 1292 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=365324, ups=0.74, wpb=490768, bsz=16304.3, num_updates=14900, lr=0.000518128, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=20305 epoch 009: 1292 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=365324, ups=0.74, wpb=490768, bsz=16304.3, num_updates=14900, lr=0.000518128, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=20305 epoch 009: 1292 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=365324, ups=0.74, wpb=490768, bsz=16304.3, num_updates=14900, lr=0.000518128, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=20305 epoch 009: 1292 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=365324, ups=0.74, wpb=490768, bsz=16304.3, num_updates=14900, lr=0.000518128, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=20305 epoch 009: 1292 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=365324, ups=0.74, wpb=490768, bsz=16304.3, num_updates=14900, lr=0.000518128, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=20305 epoch 009: 1292 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=365324, ups=0.74, wpb=490768, bsz=16304.3, num_updates=14900, lr=0.000518128, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=20305 epoch 009: 1292 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=365324, ups=0.74, wpb=490768, bsz=16304.3, num_updates=14900, lr=0.000518128, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=20305 epoch 009: 1292 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=365324, ups=0.74, wpb=490768, bsz=16304.3, num_updates=14900, lr=0.000518128, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=20305 epoch 009: 1292 / 1707 loss=3.744, nll_loss=2.21, ppl=4.63, wps=365324, ups=0.74, wpb=490768, bsz=16304.3, num_updates=14900, lr=0.000518128, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=20305 epoch 009: 1392 / 1707 loss=3.739, nll_loss=2.205, ppl=4.61, wps=367634, ups=0.75, wpb=490929, bsz=16317.8, num_updates=15000, lr=0.000516398, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=20439 epoch 009: 1392 / 1707 loss=3.739, nll_loss=2.205, ppl=4.61, wps=367634, ups=0.75, wpb=490929, bsz=16317.8, num_updates=15000, lr=0.000516398, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=20439 epoch 009: 1392 / 1707 loss=3.739, nll_loss=2.205, ppl=4.61, wps=367634, ups=0.75, wpb=490929, bsz=16317.8, num_updates=15000, lr=0.000516398, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=20439 epoch 009: 1392 / 1707 loss=3.739, nll_loss=2.205, ppl=4.61, wps=367634, ups=0.75, wpb=490929, bsz=16317.8, num_updates=15000, lr=0.000516398, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=20439 epoch 009: 1392 / 1707 loss=3.739, nll_loss=2.205, ppl=4.61, wps=367634, ups=0.75, wpb=490929, bsz=16317.8, num_updates=15000, lr=0.000516398, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=20439 epoch 009: 1392 / 1707 loss=3.739, nll_loss=2.205, ppl=4.61, wps=367634, ups=0.75, wpb=490929, bsz=16317.8, num_updates=15000, lr=0.000516398, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=20439 epoch 009: 1392 / 1707 loss=3.739, nll_loss=2.205, ppl=4.61, wps=367634, ups=0.75, wpb=490929, bsz=16317.8, num_updates=15000, lr=0.000516398, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=20439 epoch 009: 1392 / 1707 loss=3.739, nll_loss=2.205, ppl=4.61, wps=367634, ups=0.75, wpb=490929, bsz=16317.8, num_updates=15000, lr=0.000516398, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=20439 epoch 009: 1392 / 1707 loss=3.739, nll_loss=2.205, ppl=4.61, wps=367634, ups=0.75, wpb=490929, bsz=16317.8, num_updates=15000, lr=0.000516398, gnorm=0.272, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=20439 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 3.858 | nll_loss 2.316 | ppl 4.98 | wps 219520 | wpb 22263 | bsz 1004 | num_updates 15000 | best_loss 3.858 epoch 009 | valid on 'valid' subset | loss 3.858 | nll_loss 2.316 | ppl 4.98 | wps 219520 | wpb 22263 | bsz 1004 | num_updates 15000 | best_loss 3.858 epoch 009 | valid on 'valid' subset | loss 3.858 | nll_loss 2.316 | ppl 4.98 | wps 219520 | wpb 22263 | bsz 1004 | num_updates 15000 | best_loss 3.858 epoch 009 | valid on 'valid' subset | loss 3.858 | nll_loss 2.316 | ppl 4.98 | wps 219520 | wpb 22263 | bsz 1004 | num_updates 15000 | best_loss 3.858 epoch 009 | valid on 'valid' subset | loss 3.858 | nll_loss 2.316 | ppl 4.98 | wps 219520 | wpb 22263 | bsz 1004 | num_updates 15000 | best_loss 3.858 epoch 009 | valid on 'valid' subset | loss 3.858 | nll_loss 2.316 | ppl 4.98 | wps 219520 | wpb 22263 | bsz 1004 | num_updates 15000 | best_loss 3.858 epoch 009 | valid on 'valid' subset | loss 3.858 | nll_loss 2.316 | ppl 4.98 | wps 219520 | wpb 22263 | bsz 1004 | num_updates 15000 | best_loss 3.858 epoch 009 | valid on 'valid' subset | loss 3.858 | nll_loss 2.316 | ppl 4.98 | wps 219520 | wpb 22263 | bsz 1004 | num_updates 15000 | best_loss 3.858 epoch 009 | valid on 'valid' subset | loss 3.858 | nll_loss 2.316 | ppl 4.98 | wps 219520 | wpb 22263 | bsz 1004 | num_updates 15000 | best_loss 3.858 epoch 009: 1493 / 1707 loss=3.736, nll_loss=2.202, ppl=4.6, wps=325149, ups=0.66, wpb=490913, bsz=16076, num_updates=15100, lr=0.000514685, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=20590 epoch 009: 1493 / 1707 loss=3.736, nll_loss=2.202, ppl=4.6, wps=325149, ups=0.66, wpb=490913, bsz=16076, num_updates=15100, lr=0.000514685, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=20590 epoch 009: 1493 / 1707 loss=3.736, nll_loss=2.202, ppl=4.6, wps=325149, ups=0.66, wpb=490913, bsz=16076, num_updates=15100, lr=0.000514685, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=20590 epoch 009: 1493 / 1707 loss=3.736, nll_loss=2.202, ppl=4.6, wps=325149, ups=0.66, wpb=490913, bsz=16076, num_updates=15100, lr=0.000514685, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=20590 epoch 009: 1493 / 1707 loss=3.736, nll_loss=2.202, ppl=4.6, wps=325149, ups=0.66, wpb=490913, bsz=16076, num_updates=15100, lr=0.000514685, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=20590 epoch 009: 1493 / 1707 loss=3.736, nll_loss=2.202, ppl=4.6, wps=325149, ups=0.66, wpb=490913, bsz=16076, num_updates=15100, lr=0.000514685, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=20590 epoch 009: 1493 / 1707 loss=3.736, nll_loss=2.202, ppl=4.6, wps=325149, ups=0.66, wpb=490913, bsz=16076, num_updates=15100, lr=0.000514685, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=20590 epoch 009: 1493 / 1707 loss=3.736, nll_loss=2.202, ppl=4.6, wps=325149, ups=0.66, wpb=490913, bsz=16076, num_updates=15100, lr=0.000514685, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=20590 epoch 009: 1493 / 1707 loss=3.736, nll_loss=2.202, ppl=4.6, wps=325149, ups=0.66, wpb=490913, bsz=16076, num_updates=15100, lr=0.000514685, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=20590 epoch 009: 1593 / 1707 loss=3.741, nll_loss=2.208, ppl=4.62, wps=367843, ups=0.75, wpb=490741, bsz=16506.4, num_updates=15200, lr=0.000512989, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=20723 epoch 009: 1593 / 1707 loss=3.741, nll_loss=2.208, ppl=4.62, wps=367843, ups=0.75, wpb=490741, bsz=16506.4, num_updates=15200, lr=0.000512989, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=20723 epoch 009: 1593 / 1707 loss=3.741, nll_loss=2.208, ppl=4.62, wps=367843, ups=0.75, wpb=490741, bsz=16506.4, num_updates=15200, lr=0.000512989, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=20723 epoch 009: 1593 / 1707 loss=3.741, nll_loss=2.208, ppl=4.62, wps=367843, ups=0.75, wpb=490741, bsz=16506.4, num_updates=15200, lr=0.000512989, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=20723 epoch 009: 1593 / 1707 loss=3.741, nll_loss=2.208, ppl=4.62, wps=367843, ups=0.75, wpb=490741, bsz=16506.4, num_updates=15200, lr=0.000512989, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=20723 epoch 009: 1593 / 1707 loss=3.741, nll_loss=2.208, ppl=4.62, wps=367843, ups=0.75, wpb=490741, bsz=16506.4, num_updates=15200, lr=0.000512989, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=20723 epoch 009: 1593 / 1707 loss=3.741, nll_loss=2.208, ppl=4.62, wps=367843, ups=0.75, wpb=490741, bsz=16506.4, num_updates=15200, lr=0.000512989, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=20723 epoch 009: 1593 / 1707 loss=3.741, nll_loss=2.208, ppl=4.62, wps=367843, ups=0.75, wpb=490741, bsz=16506.4, num_updates=15200, lr=0.000512989, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=20723 epoch 009: 1593 / 1707 loss=3.741, nll_loss=2.208, ppl=4.62, wps=367843, ups=0.75, wpb=490741, bsz=16506.4, num_updates=15200, lr=0.000512989, gnorm=0.283, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=20723 epoch 009: 1693 / 1707 loss=3.747, nll_loss=2.214, ppl=4.64, wps=367184, ups=0.75, wpb=490602, bsz=16398.4, num_updates=15300, lr=0.00051131, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=20857 epoch 009: 1693 / 1707 loss=3.747, nll_loss=2.214, ppl=4.64, wps=367184, ups=0.75, wpb=490602, bsz=16398.4, num_updates=15300, lr=0.00051131, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=20857 epoch 009: 1693 / 1707 loss=3.747, nll_loss=2.214, ppl=4.64, wps=367184, ups=0.75, wpb=490602, bsz=16398.4, num_updates=15300, lr=0.00051131, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=20857 epoch 009: 1693 / 1707 loss=3.747, nll_loss=2.214, ppl=4.64, wps=367184, ups=0.75, wpb=490602, bsz=16398.4, num_updates=15300, lr=0.00051131, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=20857 epoch 009: 1693 / 1707 loss=3.747, nll_loss=2.214, ppl=4.64, wps=367184, ups=0.75, wpb=490602, bsz=16398.4, num_updates=15300, lr=0.00051131, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=20857 epoch 009: 1693 / 1707 loss=3.747, nll_loss=2.214, ppl=4.64, wps=367184, ups=0.75, wpb=490602, bsz=16398.4, num_updates=15300, lr=0.00051131, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=20857 epoch 009: 1693 / 1707 loss=3.747, nll_loss=2.214, ppl=4.64, wps=367184, ups=0.75, wpb=490602, bsz=16398.4, num_updates=15300, lr=0.00051131, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=20857 epoch 009: 1693 / 1707 loss=3.747, nll_loss=2.214, ppl=4.64, wps=367184, ups=0.75, wpb=490602, bsz=16398.4, num_updates=15300, lr=0.00051131, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=20857 epoch 009: 1693 / 1707 loss=3.747, nll_loss=2.214, ppl=4.64, wps=367184, ups=0.75, wpb=490602, bsz=16398.4, num_updates=15300, lr=0.00051131, gnorm=0.274, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=20857 end of epoch 9 (average epoch stats below) epoch 009 | loss 3.741 | nll_loss 2.206 | ppl 4.62 | wps 360488 | ups 0.74 | wpb 489921 | bsz 16334.3 | num_updates 15314 | lr 0.000511076 | gnorm 0.274 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 20874 epoch 009 | loss 3.741 | nll_loss 2.206 | ppl 4.62 | wps 360488 | ups 0.74 | wpb 489921 | bsz 16334.3 | num_updates 15314 | lr 0.000511076 | gnorm 0.274 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 20874 epoch 009 | loss 3.741 | nll_loss 2.206 | ppl 4.62 | wps 360488 | ups 0.74 | wpb 489921 | bsz 16334.3 | num_updates 15314 | lr 0.000511076 | gnorm 0.274 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 20874 epoch 009 | loss 3.741 | nll_loss 2.206 | ppl 4.62 | wps 360488 | ups 0.74 | wpb 489921 | bsz 16334.3 | num_updates 15314 | lr 0.000511076 | gnorm 0.274 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 20874 epoch 009 | loss 3.741 | nll_loss 2.206 | ppl 4.62 | wps 360488 | ups 0.74 | wpb 489921 | bsz 16334.3 | num_updates 15314 | lr 0.000511076 | gnorm 0.274 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 20874 epoch 009 | loss 3.741 | nll_loss 2.206 | ppl 4.62 | wps 360488 | ups 0.74 | wpb 489921 | bsz 16334.3 | num_updates 15314 | lr 0.000511076 | gnorm 0.274 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 20874 epoch 009 | loss 3.741 | nll_loss 2.206 | ppl 4.62 | wps 360488 | ups 0.74 | wpb 489921 | bsz 16334.3 | num_updates 15314 | lr 0.000511076 | gnorm 0.274 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 20874 epoch 009 | loss 3.741 | nll_loss 2.206 | ppl 4.62 | wps 360488 | ups 0.74 | wpb 489921 | bsz 16334.3 | num_updates 15314 | lr 0.000511076 | gnorm 0.274 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 20874 epoch 009 | loss 3.741 | nll_loss 2.206 | ppl 4.62 | wps 360488 | ups 0.74 | wpb 489921 | bsz 16334.3 | num_updates 15314 | lr 0.000511076 | gnorm 0.274 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 20874 Start iterating over samples epoch 010: 86 / 1707 loss=3.71, nll_loss=2.171, ppl=4.5, wps=366181, ups=0.75, wpb=486361, bsz=16110.9, num_updates=15400, lr=0.000509647, gnorm=0.272, clip=0, loss_scale=4, train_wall=132, gb_free=60.7, wall=20990 epoch 010: 86 / 1707 loss=3.71, nll_loss=2.171, ppl=4.5, wps=366181, ups=0.75, wpb=486361, bsz=16110.9, num_updates=15400, lr=0.000509647, gnorm=0.272, clip=0, loss_scale=4, train_wall=132, gb_free=60.7, wall=20990 epoch 010: 86 / 1707 loss=3.71, nll_loss=2.171, ppl=4.5, wps=366181, ups=0.75, wpb=486361, bsz=16110.9, num_updates=15400, lr=0.000509647, gnorm=0.272, clip=0, loss_scale=4, train_wall=132, gb_free=60.7, wall=20990 epoch 010: 86 / 1707 loss=3.71, nll_loss=2.171, ppl=4.5, wps=366181, ups=0.75, wpb=486361, bsz=16110.9, num_updates=15400, lr=0.000509647, gnorm=0.272, clip=0, loss_scale=4, train_wall=132, gb_free=60.7, wall=20990 epoch 010: 86 / 1707 loss=3.71, nll_loss=2.171, ppl=4.5, wps=366181, ups=0.75, wpb=486361, bsz=16110.9, num_updates=15400, lr=0.000509647, gnorm=0.272, clip=0, loss_scale=4, train_wall=132, gb_free=60.7, wall=20990 epoch 010: 86 / 1707 loss=3.71, nll_loss=2.171, ppl=4.5, wps=366181, ups=0.75, wpb=486361, bsz=16110.9, num_updates=15400, lr=0.000509647, gnorm=0.272, clip=0, loss_scale=4, train_wall=132, gb_free=60.7, wall=20990 epoch 010: 86 / 1707 loss=3.71, nll_loss=2.171, ppl=4.5, wps=366181, ups=0.75, wpb=486361, bsz=16110.9, num_updates=15400, lr=0.000509647, gnorm=0.272, clip=0, loss_scale=4, train_wall=132, gb_free=60.7, wall=20990 epoch 010: 86 / 1707 loss=3.71, nll_loss=2.171, ppl=4.5, wps=366181, ups=0.75, wpb=486361, bsz=16110.9, num_updates=15400, lr=0.000509647, gnorm=0.272, clip=0, loss_scale=4, train_wall=132, gb_free=60.7, wall=20990 epoch 010: 86 / 1707 loss=3.71, nll_loss=2.171, ppl=4.5, wps=366181, ups=0.75, wpb=486361, bsz=16110.9, num_updates=15400, lr=0.000509647, gnorm=0.272, clip=0, loss_scale=4, train_wall=132, gb_free=60.7, wall=20990 epoch 010: 86 / 1707 loss=3.71, nll_loss=2.171, ppl=4.5, wps=366181, ups=0.75, wpb=486361, bsz=16110.9, num_updates=15400, lr=0.000509647, gnorm=0.272, clip=0, loss_scale=4, train_wall=132, gb_free=60.7, wall=20990 epoch 010: 186 / 1707 loss=3.711, nll_loss=2.172, ppl=4.51, wps=366355, ups=0.75, wpb=489305, bsz=16138.8, num_updates=15500, lr=0.000508001, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=21123 epoch 010: 186 / 1707 loss=3.711, nll_loss=2.172, ppl=4.51, wps=366355, ups=0.75, wpb=489305, bsz=16138.8, num_updates=15500, lr=0.000508001, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=21123 epoch 010: 186 / 1707 loss=3.711, nll_loss=2.172, ppl=4.51, wps=366355, ups=0.75, wpb=489305, bsz=16138.8, num_updates=15500, lr=0.000508001, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=21123 epoch 010: 186 / 1707 loss=3.711, nll_loss=2.172, ppl=4.51, wps=366355, ups=0.75, wpb=489305, bsz=16138.8, num_updates=15500, lr=0.000508001, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=21123 epoch 010: 186 / 1707 loss=3.711, nll_loss=2.172, ppl=4.51, wps=366355, ups=0.75, wpb=489305, bsz=16138.8, num_updates=15500, lr=0.000508001, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=21123 epoch 010: 186 / 1707 loss=3.711, nll_loss=2.172, ppl=4.51, wps=366355, ups=0.75, wpb=489305, bsz=16138.8, num_updates=15500, lr=0.000508001, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=21123 epoch 010: 186 / 1707 loss=3.711, nll_loss=2.172, ppl=4.51, wps=366355, ups=0.75, wpb=489305, bsz=16138.8, num_updates=15500, lr=0.000508001, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=21123 epoch 010: 186 / 1707 loss=3.711, nll_loss=2.172, ppl=4.51, wps=366355, ups=0.75, wpb=489305, bsz=16138.8, num_updates=15500, lr=0.000508001, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=21123 epoch 010: 186 / 1707 loss=3.711, nll_loss=2.172, ppl=4.51, wps=366355, ups=0.75, wpb=489305, bsz=16138.8, num_updates=15500, lr=0.000508001, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=21123 epoch 010: 186 / 1707 loss=3.711, nll_loss=2.172, ppl=4.51, wps=366355, ups=0.75, wpb=489305, bsz=16138.8, num_updates=15500, lr=0.000508001, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=21123 epoch 010: 287 / 1707 loss=3.712, nll_loss=2.173, ppl=4.51, wps=364355, ups=0.74, wpb=491332, bsz=16226.9, num_updates=15600, lr=0.00050637, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=21258 epoch 010: 287 / 1707 loss=3.712, nll_loss=2.173, ppl=4.51, wps=364355, ups=0.74, wpb=491332, bsz=16226.9, num_updates=15600, lr=0.00050637, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=21258 epoch 010: 287 / 1707 loss=3.712, nll_loss=2.173, ppl=4.51, wps=364355, ups=0.74, wpb=491332, bsz=16226.9, num_updates=15600, lr=0.00050637, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=21258 epoch 010: 287 / 1707 loss=3.712, nll_loss=2.173, ppl=4.51, wps=364355, ups=0.74, wpb=491332, bsz=16226.9, num_updates=15600, lr=0.00050637, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=21258 epoch 010: 287 / 1707 loss=3.712, nll_loss=2.173, ppl=4.51, wps=364355, ups=0.74, wpb=491332, bsz=16226.9, num_updates=15600, lr=0.00050637, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=21258 epoch 010: 287 / 1707 loss=3.712, nll_loss=2.173, ppl=4.51, wps=364355, ups=0.74, wpb=491332, bsz=16226.9, num_updates=15600, lr=0.00050637, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=21258 epoch 010: 287 / 1707 loss=3.712, nll_loss=2.173, ppl=4.51, wps=364355, ups=0.74, wpb=491332, bsz=16226.9, num_updates=15600, lr=0.00050637, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=21258 epoch 010: 287 / 1707 loss=3.712, nll_loss=2.173, ppl=4.51, wps=364355, ups=0.74, wpb=491332, bsz=16226.9, num_updates=15600, lr=0.00050637, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=21258 epoch 010: 287 / 1707 loss=3.712, nll_loss=2.173, ppl=4.51, wps=364355, ups=0.74, wpb=491332, bsz=16226.9, num_updates=15600, lr=0.00050637, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=21258 epoch 010: 287 / 1707 loss=3.712, nll_loss=2.173, ppl=4.51, wps=364355, ups=0.74, wpb=491332, bsz=16226.9, num_updates=15600, lr=0.00050637, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=21258 epoch 010: 388 / 1707 loss=3.71, nll_loss=2.172, ppl=4.51, wps=364017, ups=0.74, wpb=491194, bsz=16551.3, num_updates=15700, lr=0.000504754, gnorm=0.27, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=21393 epoch 010: 388 / 1707 loss=3.71, nll_loss=2.172, ppl=4.51, wps=364017, ups=0.74, wpb=491194, bsz=16551.3, num_updates=15700, lr=0.000504754, gnorm=0.27, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=21393 epoch 010: 388 / 1707 loss=3.71, nll_loss=2.172, ppl=4.51, wps=364017, ups=0.74, wpb=491194, bsz=16551.3, num_updates=15700, lr=0.000504754, gnorm=0.27, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=21393 epoch 010: 388 / 1707 loss=3.71, nll_loss=2.172, ppl=4.51, wps=364017, ups=0.74, wpb=491194, bsz=16551.3, num_updates=15700, lr=0.000504754, gnorm=0.27, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=21393 epoch 010: 388 / 1707 loss=3.71, nll_loss=2.172, ppl=4.51, wps=364017, ups=0.74, wpb=491194, bsz=16551.3, num_updates=15700, lr=0.000504754, gnorm=0.27, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=21393 epoch 010: 388 / 1707 loss=3.71, nll_loss=2.172, ppl=4.51, wps=364017, ups=0.74, wpb=491194, bsz=16551.3, num_updates=15700, lr=0.000504754, gnorm=0.27, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=21393 epoch 010: 388 / 1707 loss=3.71, nll_loss=2.172, ppl=4.51, wps=364017, ups=0.74, wpb=491194, bsz=16551.3, num_updates=15700, lr=0.000504754, gnorm=0.27, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=21393 epoch 010: 388 / 1707 loss=3.71, nll_loss=2.172, ppl=4.51, wps=364017, ups=0.74, wpb=491194, bsz=16551.3, num_updates=15700, lr=0.000504754, gnorm=0.27, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=21393 epoch 010: 388 / 1707 loss=3.71, nll_loss=2.172, ppl=4.51, wps=364017, ups=0.74, wpb=491194, bsz=16551.3, num_updates=15700, lr=0.000504754, gnorm=0.27, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=21393 epoch 010: 388 / 1707 loss=3.71, nll_loss=2.172, ppl=4.51, wps=364017, ups=0.74, wpb=491194, bsz=16551.3, num_updates=15700, lr=0.000504754, gnorm=0.27, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=21393 epoch 010: 489 / 1707 loss=3.712, nll_loss=2.175, ppl=4.51, wps=362546, ups=0.74, wpb=489722, bsz=16163.7, num_updates=15800, lr=0.000503155, gnorm=0.285, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=21528 epoch 010: 489 / 1707 loss=3.712, nll_loss=2.175, ppl=4.51, wps=362546, ups=0.74, wpb=489722, bsz=16163.7, num_updates=15800, lr=0.000503155, gnorm=0.285, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=21528 epoch 010: 489 / 1707 loss=3.712, nll_loss=2.175, ppl=4.51, wps=362546, ups=0.74, wpb=489722, bsz=16163.7, num_updates=15800, lr=0.000503155, gnorm=0.285, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=21528 epoch 010: 489 / 1707 loss=3.712, nll_loss=2.175, ppl=4.51, wps=362546, ups=0.74, wpb=489722, bsz=16163.7, num_updates=15800, lr=0.000503155, gnorm=0.285, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=21528 epoch 010: 489 / 1707 loss=3.712, nll_loss=2.175, ppl=4.51, wps=362546, ups=0.74, wpb=489722, bsz=16163.7, num_updates=15800, lr=0.000503155, gnorm=0.285, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=21528 epoch 010: 489 / 1707 loss=3.712, nll_loss=2.175, ppl=4.51, wps=362546, ups=0.74, wpb=489722, bsz=16163.7, num_updates=15800, lr=0.000503155, gnorm=0.285, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=21528 epoch 010: 489 / 1707 loss=3.712, nll_loss=2.175, ppl=4.51, wps=362546, ups=0.74, wpb=489722, bsz=16163.7, num_updates=15800, lr=0.000503155, gnorm=0.285, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=21528 epoch 010: 489 / 1707 loss=3.712, nll_loss=2.175, ppl=4.51, wps=362546, ups=0.74, wpb=489722, bsz=16163.7, num_updates=15800, lr=0.000503155, gnorm=0.285, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=21528 epoch 010: 489 / 1707 loss=3.712, nll_loss=2.175, ppl=4.51, wps=362546, ups=0.74, wpb=489722, bsz=16163.7, num_updates=15800, lr=0.000503155, gnorm=0.285, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=21528 epoch 010: 489 / 1707 loss=3.712, nll_loss=2.175, ppl=4.51, wps=362546, ups=0.74, wpb=489722, bsz=16163.7, num_updates=15800, lr=0.000503155, gnorm=0.285, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=21528 epoch 010: 589 / 1707 loss=3.718, nll_loss=2.182, ppl=4.54, wps=366472, ups=0.75, wpb=490108, bsz=16681.3, num_updates=15900, lr=0.00050157, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=21662 epoch 010: 589 / 1707 loss=3.718, nll_loss=2.182, ppl=4.54, wps=366472, ups=0.75, wpb=490108, bsz=16681.3, num_updates=15900, lr=0.00050157, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=21662 epoch 010: 589 / 1707 loss=3.718, nll_loss=2.182, ppl=4.54, wps=366472, ups=0.75, wpb=490108, bsz=16681.3, num_updates=15900, lr=0.00050157, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=21662 epoch 010: 589 / 1707 loss=3.718, nll_loss=2.182, ppl=4.54, wps=366472, ups=0.75, wpb=490108, bsz=16681.3, num_updates=15900, lr=0.00050157, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=21662 epoch 010: 589 / 1707 loss=3.718, nll_loss=2.182, ppl=4.54, wps=366472, ups=0.75, wpb=490108, bsz=16681.3, num_updates=15900, lr=0.00050157, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=21662 epoch 010: 589 / 1707 loss=3.718, nll_loss=2.182, ppl=4.54, wps=366472, ups=0.75, wpb=490108, bsz=16681.3, num_updates=15900, lr=0.00050157, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=21662 epoch 010: 589 / 1707 loss=3.718, nll_loss=2.182, ppl=4.54, wps=366472, ups=0.75, wpb=490108, bsz=16681.3, num_updates=15900, lr=0.00050157, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=21662 epoch 010: 589 / 1707 loss=3.718, nll_loss=2.182, ppl=4.54, wps=366472, ups=0.75, wpb=490108, bsz=16681.3, num_updates=15900, lr=0.00050157, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=21662 epoch 010: 589 / 1707 loss=3.718, nll_loss=2.182, ppl=4.54, wps=366472, ups=0.75, wpb=490108, bsz=16681.3, num_updates=15900, lr=0.00050157, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=21662 epoch 010: 589 / 1707 loss=3.718, nll_loss=2.182, ppl=4.54, wps=366472, ups=0.75, wpb=490108, bsz=16681.3, num_updates=15900, lr=0.00050157, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=21662 epoch 010: 689 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366773, ups=0.75, wpb=489652, bsz=16486.7, num_updates=16000, lr=0.0005, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=21795 epoch 010: 689 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366773, ups=0.75, wpb=489652, bsz=16486.7, num_updates=16000, lr=0.0005, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=21795 epoch 010: 689 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366773, ups=0.75, wpb=489652, bsz=16486.7, num_updates=16000, lr=0.0005, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=21795 epoch 010: 689 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366773, ups=0.75, wpb=489652, bsz=16486.7, num_updates=16000, lr=0.0005, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=21795 epoch 010: 689 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366773, ups=0.75, wpb=489652, bsz=16486.7, num_updates=16000, lr=0.0005, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=21795 epoch 010: 689 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366773, ups=0.75, wpb=489652, bsz=16486.7, num_updates=16000, lr=0.0005, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=21795 epoch 010: 689 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366773, ups=0.75, wpb=489652, bsz=16486.7, num_updates=16000, lr=0.0005, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=21795 epoch 010: 689 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366773, ups=0.75, wpb=489652, bsz=16486.7, num_updates=16000, lr=0.0005, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=21795 epoch 010: 689 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366773, ups=0.75, wpb=489652, bsz=16486.7, num_updates=16000, lr=0.0005, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=21795 epoch 010: 689 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366773, ups=0.75, wpb=489652, bsz=16486.7, num_updates=16000, lr=0.0005, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=21795 begin validation on "valid" subset epoch 010 | valid on 'valid' subset | loss 3.839 | nll_loss 2.296 | ppl 4.91 | wps 222701 | wpb 22263 | bsz 1004 | num_updates 16000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.839 | nll_loss 2.296 | ppl 4.91 | wps 222701 | wpb 22263 | bsz 1004 | num_updates 16000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.839 | nll_loss 2.296 | ppl 4.91 | wps 222701 | wpb 22263 | bsz 1004 | num_updates 16000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.839 | nll_loss 2.296 | ppl 4.91 | wps 222701 | wpb 22263 | bsz 1004 | num_updates 16000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.839 | nll_loss 2.296 | ppl 4.91 | wps 222701 | wpb 22263 | bsz 1004 | num_updates 16000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.839 | nll_loss 2.296 | ppl 4.91 | wps 222701 | wpb 22263 | bsz 1004 | num_updates 16000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.839 | nll_loss 2.296 | ppl 4.91 | wps 222701 | wpb 22263 | bsz 1004 | num_updates 16000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.839 | nll_loss 2.296 | ppl 4.91 | wps 222701 | wpb 22263 | bsz 1004 | num_updates 16000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.839 | nll_loss 2.296 | ppl 4.91 | wps 222701 | wpb 22263 | bsz 1004 | num_updates 16000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.839 | nll_loss 2.296 | ppl 4.91 | wps 222701 | wpb 22263 | bsz 1004 | num_updates 16000 | best_loss 3.839 epoch 010: 789 / 1707 loss=3.716, nll_loss=2.179, ppl=4.53, wps=319727, ups=0.65, wpb=490173, bsz=16201, num_updates=16100, lr=0.000498445, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=21949 epoch 010: 789 / 1707 loss=3.716, nll_loss=2.179, ppl=4.53, wps=319727, ups=0.65, wpb=490173, bsz=16201, num_updates=16100, lr=0.000498445, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=21949 epoch 010: 789 / 1707 loss=3.716, nll_loss=2.179, ppl=4.53, wps=319727, ups=0.65, wpb=490173, bsz=16201, num_updates=16100, lr=0.000498445, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=21949 epoch 010: 789 / 1707 loss=3.716, nll_loss=2.179, ppl=4.53, wps=319727, ups=0.65, wpb=490173, bsz=16201, num_updates=16100, lr=0.000498445, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=21949 epoch 010: 789 / 1707 loss=3.716, nll_loss=2.179, ppl=4.53, wps=319727, ups=0.65, wpb=490173, bsz=16201, num_updates=16100, lr=0.000498445, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=21949 epoch 010: 789 / 1707 loss=3.716, nll_loss=2.179, ppl=4.53, wps=319727, ups=0.65, wpb=490173, bsz=16201, num_updates=16100, lr=0.000498445, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=21949 epoch 010: 789 / 1707 loss=3.716, nll_loss=2.179, ppl=4.53, wps=319727, ups=0.65, wpb=490173, bsz=16201, num_updates=16100, lr=0.000498445, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=21949 epoch 010: 789 / 1707 loss=3.716, nll_loss=2.179, ppl=4.53, wps=319727, ups=0.65, wpb=490173, bsz=16201, num_updates=16100, lr=0.000498445, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=21949 epoch 010: 789 / 1707 loss=3.716, nll_loss=2.179, ppl=4.53, wps=319727, ups=0.65, wpb=490173, bsz=16201, num_updates=16100, lr=0.000498445, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=21949 epoch 010: 789 / 1707 loss=3.716, nll_loss=2.179, ppl=4.53, wps=319727, ups=0.65, wpb=490173, bsz=16201, num_updates=16100, lr=0.000498445, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=21949 epoch 010: 889 / 1707 loss=3.713, nll_loss=2.176, ppl=4.52, wps=369106, ups=0.75, wpb=491161, bsz=16460.8, num_updates=16200, lr=0.000496904, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=22082 epoch 010: 889 / 1707 loss=3.713, nll_loss=2.176, ppl=4.52, wps=369106, ups=0.75, wpb=491161, bsz=16460.8, num_updates=16200, lr=0.000496904, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=22082 epoch 010: 889 / 1707 loss=3.713, nll_loss=2.176, ppl=4.52, wps=369106, ups=0.75, wpb=491161, bsz=16460.8, num_updates=16200, lr=0.000496904, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=22082 epoch 010: 889 / 1707 loss=3.713, nll_loss=2.176, ppl=4.52, wps=369106, ups=0.75, wpb=491161, bsz=16460.8, num_updates=16200, lr=0.000496904, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=22082 epoch 010: 889 / 1707 loss=3.713, nll_loss=2.176, ppl=4.52, wps=369106, ups=0.75, wpb=491161, bsz=16460.8, num_updates=16200, lr=0.000496904, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=22082 epoch 010: 889 / 1707 loss=3.713, nll_loss=2.176, ppl=4.52, wps=369106, ups=0.75, wpb=491161, bsz=16460.8, num_updates=16200, lr=0.000496904, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=22082 epoch 010: 889 / 1707 loss=3.713, nll_loss=2.176, ppl=4.52, wps=369106, ups=0.75, wpb=491161, bsz=16460.8, num_updates=16200, lr=0.000496904, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=22082 epoch 010: 889 / 1707 loss=3.713, nll_loss=2.176, ppl=4.52, wps=369106, ups=0.75, wpb=491161, bsz=16460.8, num_updates=16200, lr=0.000496904, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=22082 epoch 010: 889 / 1707 loss=3.713, nll_loss=2.176, ppl=4.52, wps=369106, ups=0.75, wpb=491161, bsz=16460.8, num_updates=16200, lr=0.000496904, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=22082 epoch 010: 889 / 1707 loss=3.713, nll_loss=2.176, ppl=4.52, wps=369106, ups=0.75, wpb=491161, bsz=16460.8, num_updates=16200, lr=0.000496904, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=22082 epoch 010: 989 / 1707 loss=3.721, nll_loss=2.185, ppl=4.55, wps=367848, ups=0.75, wpb=490418, bsz=16244.5, num_updates=16300, lr=0.000495377, gnorm=0.268, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22215 epoch 010: 989 / 1707 loss=3.721, nll_loss=2.185, ppl=4.55, wps=367848, ups=0.75, wpb=490418, bsz=16244.5, num_updates=16300, lr=0.000495377, gnorm=0.268, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22215 epoch 010: 989 / 1707 loss=3.721, nll_loss=2.185, ppl=4.55, wps=367848, ups=0.75, wpb=490418, bsz=16244.5, num_updates=16300, lr=0.000495377, gnorm=0.268, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22215 epoch 010: 989 / 1707 loss=3.721, nll_loss=2.185, ppl=4.55, wps=367848, ups=0.75, wpb=490418, bsz=16244.5, num_updates=16300, lr=0.000495377, gnorm=0.268, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22215 epoch 010: 989 / 1707 loss=3.721, nll_loss=2.185, ppl=4.55, wps=367848, ups=0.75, wpb=490418, bsz=16244.5, num_updates=16300, lr=0.000495377, gnorm=0.268, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22215 epoch 010: 989 / 1707 loss=3.721, nll_loss=2.185, ppl=4.55, wps=367848, ups=0.75, wpb=490418, bsz=16244.5, num_updates=16300, lr=0.000495377, gnorm=0.268, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22215 epoch 010: 989 / 1707 loss=3.721, nll_loss=2.185, ppl=4.55, wps=367848, ups=0.75, wpb=490418, bsz=16244.5, num_updates=16300, lr=0.000495377, gnorm=0.268, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22215 epoch 010: 989 / 1707 loss=3.721, nll_loss=2.185, ppl=4.55, wps=367848, ups=0.75, wpb=490418, bsz=16244.5, num_updates=16300, lr=0.000495377, gnorm=0.268, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22215 epoch 010: 989 / 1707 loss=3.721, nll_loss=2.185, ppl=4.55, wps=367848, ups=0.75, wpb=490418, bsz=16244.5, num_updates=16300, lr=0.000495377, gnorm=0.268, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22215 epoch 010: 989 / 1707 loss=3.721, nll_loss=2.185, ppl=4.55, wps=367848, ups=0.75, wpb=490418, bsz=16244.5, num_updates=16300, lr=0.000495377, gnorm=0.268, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22215 epoch 010: 1089 / 1707 loss=3.718, nll_loss=2.181, ppl=4.54, wps=366896, ups=0.75, wpb=488674, bsz=16292.1, num_updates=16400, lr=0.000493865, gnorm=0.261, clip=0, loss_scale=4, train_wall=133, gb_free=59.9, wall=22348 epoch 010: 1089 / 1707 loss=3.718, nll_loss=2.181, ppl=4.54, wps=366896, ups=0.75, wpb=488674, bsz=16292.1, num_updates=16400, lr=0.000493865, gnorm=0.261, clip=0, loss_scale=4, train_wall=133, gb_free=59.9, wall=22348 epoch 010: 1089 / 1707 loss=3.718, nll_loss=2.181, ppl=4.54, wps=366896, ups=0.75, wpb=488674, bsz=16292.1, num_updates=16400, lr=0.000493865, gnorm=0.261, clip=0, loss_scale=4, train_wall=133, gb_free=59.9, wall=22348 epoch 010: 1089 / 1707 loss=3.718, nll_loss=2.181, ppl=4.54, wps=366896, ups=0.75, wpb=488674, bsz=16292.1, num_updates=16400, lr=0.000493865, gnorm=0.261, clip=0, loss_scale=4, train_wall=133, gb_free=59.9, wall=22348 epoch 010: 1089 / 1707 loss=3.718, nll_loss=2.181, ppl=4.54, wps=366896, ups=0.75, wpb=488674, bsz=16292.1, num_updates=16400, lr=0.000493865, gnorm=0.261, clip=0, loss_scale=4, train_wall=133, gb_free=59.9, wall=22348 epoch 010: 1089 / 1707 loss=3.718, nll_loss=2.181, ppl=4.54, wps=366896, ups=0.75, wpb=488674, bsz=16292.1, num_updates=16400, lr=0.000493865, gnorm=0.261, clip=0, loss_scale=4, train_wall=133, gb_free=59.9, wall=22348 epoch 010: 1089 / 1707 loss=3.718, nll_loss=2.181, ppl=4.54, wps=366896, ups=0.75, wpb=488674, bsz=16292.1, num_updates=16400, lr=0.000493865, gnorm=0.261, clip=0, loss_scale=4, train_wall=133, gb_free=59.9, wall=22348 epoch 010: 1089 / 1707 loss=3.718, nll_loss=2.181, ppl=4.54, wps=366896, ups=0.75, wpb=488674, bsz=16292.1, num_updates=16400, lr=0.000493865, gnorm=0.261, clip=0, loss_scale=4, train_wall=133, gb_free=59.9, wall=22348 epoch 010: 1089 / 1707 loss=3.718, nll_loss=2.181, ppl=4.54, wps=366896, ups=0.75, wpb=488674, bsz=16292.1, num_updates=16400, lr=0.000493865, gnorm=0.261, clip=0, loss_scale=4, train_wall=133, gb_free=59.9, wall=22348 epoch 010: 1089 / 1707 loss=3.718, nll_loss=2.181, ppl=4.54, wps=366896, ups=0.75, wpb=488674, bsz=16292.1, num_updates=16400, lr=0.000493865, gnorm=0.261, clip=0, loss_scale=4, train_wall=133, gb_free=59.9, wall=22348 epoch 010: 1190 / 1707 loss=3.717, nll_loss=2.181, ppl=4.53, wps=361993, ups=0.74, wpb=490027, bsz=16153.5, num_updates=16500, lr=0.000492366, gnorm=0.271, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=22484 epoch 010: 1190 / 1707 loss=3.717, nll_loss=2.181, ppl=4.53, wps=361993, ups=0.74, wpb=490027, bsz=16153.5, num_updates=16500, lr=0.000492366, gnorm=0.271, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=22484 epoch 010: 1190 / 1707 loss=3.717, nll_loss=2.181, ppl=4.53, wps=361993, ups=0.74, wpb=490027, bsz=16153.5, num_updates=16500, lr=0.000492366, gnorm=0.271, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=22484 epoch 010: 1190 / 1707 loss=3.717, nll_loss=2.181, ppl=4.53, wps=361993, ups=0.74, wpb=490027, bsz=16153.5, num_updates=16500, lr=0.000492366, gnorm=0.271, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=22484 epoch 010: 1190 / 1707 loss=3.717, nll_loss=2.181, ppl=4.53, wps=361993, ups=0.74, wpb=490027, bsz=16153.5, num_updates=16500, lr=0.000492366, gnorm=0.271, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=22484 epoch 010: 1190 / 1707 loss=3.717, nll_loss=2.181, ppl=4.53, wps=361993, ups=0.74, wpb=490027, bsz=16153.5, num_updates=16500, lr=0.000492366, gnorm=0.271, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=22484 epoch 010: 1190 / 1707 loss=3.717, nll_loss=2.181, ppl=4.53, wps=361993, ups=0.74, wpb=490027, bsz=16153.5, num_updates=16500, lr=0.000492366, gnorm=0.271, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=22484 epoch 010: 1190 / 1707 loss=3.717, nll_loss=2.181, ppl=4.53, wps=361993, ups=0.74, wpb=490027, bsz=16153.5, num_updates=16500, lr=0.000492366, gnorm=0.271, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=22484 epoch 010: 1190 / 1707 loss=3.717, nll_loss=2.181, ppl=4.53, wps=361993, ups=0.74, wpb=490027, bsz=16153.5, num_updates=16500, lr=0.000492366, gnorm=0.271, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=22484 epoch 010: 1190 / 1707 loss=3.717, nll_loss=2.181, ppl=4.53, wps=361993, ups=0.74, wpb=490027, bsz=16153.5, num_updates=16500, lr=0.000492366, gnorm=0.271, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=22484 epoch 010: 1290 / 1707 loss=3.716, nll_loss=2.18, ppl=4.53, wps=366468, ups=0.75, wpb=490830, bsz=16534.9, num_updates=16600, lr=0.000490881, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=22617 epoch 010: 1290 / 1707 loss=3.716, nll_loss=2.18, ppl=4.53, wps=366468, ups=0.75, wpb=490830, bsz=16534.9, num_updates=16600, lr=0.000490881, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=22617 epoch 010: 1290 / 1707 loss=3.716, nll_loss=2.18, ppl=4.53, wps=366468, ups=0.75, wpb=490830, bsz=16534.9, num_updates=16600, lr=0.000490881, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=22617 epoch 010: 1290 / 1707 loss=3.716, nll_loss=2.18, ppl=4.53, wps=366468, ups=0.75, wpb=490830, bsz=16534.9, num_updates=16600, lr=0.000490881, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=22617 epoch 010: 1290 / 1707 loss=3.716, nll_loss=2.18, ppl=4.53, wps=366468, ups=0.75, wpb=490830, bsz=16534.9, num_updates=16600, lr=0.000490881, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=22617 epoch 010: 1290 / 1707 loss=3.716, nll_loss=2.18, ppl=4.53, wps=366468, ups=0.75, wpb=490830, bsz=16534.9, num_updates=16600, lr=0.000490881, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=22617 epoch 010: 1290 / 1707 loss=3.716, nll_loss=2.18, ppl=4.53, wps=366468, ups=0.75, wpb=490830, bsz=16534.9, num_updates=16600, lr=0.000490881, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=22617 epoch 010: 1290 / 1707 loss=3.716, nll_loss=2.18, ppl=4.53, wps=366468, ups=0.75, wpb=490830, bsz=16534.9, num_updates=16600, lr=0.000490881, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=22617 epoch 010: 1290 / 1707 loss=3.716, nll_loss=2.18, ppl=4.53, wps=366468, ups=0.75, wpb=490830, bsz=16534.9, num_updates=16600, lr=0.000490881, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=22617 epoch 010: 1290 / 1707 loss=3.716, nll_loss=2.18, ppl=4.53, wps=366468, ups=0.75, wpb=490830, bsz=16534.9, num_updates=16600, lr=0.000490881, gnorm=0.267, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=22617 epoch 010: 1390 / 1707 loss=3.713, nll_loss=2.177, ppl=4.52, wps=365704, ups=0.75, wpb=489736, bsz=16489.7, num_updates=16700, lr=0.000489409, gnorm=0.282, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=22751 epoch 010: 1390 / 1707 loss=3.713, nll_loss=2.177, ppl=4.52, wps=365704, ups=0.75, wpb=489736, bsz=16489.7, num_updates=16700, lr=0.000489409, gnorm=0.282, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=22751 epoch 010: 1390 / 1707 loss=3.713, nll_loss=2.177, ppl=4.52, wps=365704, ups=0.75, wpb=489736, bsz=16489.7, num_updates=16700, lr=0.000489409, gnorm=0.282, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=22751 epoch 010: 1390 / 1707 loss=3.713, nll_loss=2.177, ppl=4.52, wps=365704, ups=0.75, wpb=489736, bsz=16489.7, num_updates=16700, lr=0.000489409, gnorm=0.282, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=22751 epoch 010: 1390 / 1707 loss=3.713, nll_loss=2.177, ppl=4.52, wps=365704, ups=0.75, wpb=489736, bsz=16489.7, num_updates=16700, lr=0.000489409, gnorm=0.282, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=22751 epoch 010: 1390 / 1707 loss=3.713, nll_loss=2.177, ppl=4.52, wps=365704, ups=0.75, wpb=489736, bsz=16489.7, num_updates=16700, lr=0.000489409, gnorm=0.282, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=22751 epoch 010: 1390 / 1707 loss=3.713, nll_loss=2.177, ppl=4.52, wps=365704, ups=0.75, wpb=489736, bsz=16489.7, num_updates=16700, lr=0.000489409, gnorm=0.282, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=22751 epoch 010: 1390 / 1707 loss=3.713, nll_loss=2.177, ppl=4.52, wps=365704, ups=0.75, wpb=489736, bsz=16489.7, num_updates=16700, lr=0.000489409, gnorm=0.282, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=22751 epoch 010: 1390 / 1707 loss=3.713, nll_loss=2.177, ppl=4.52, wps=365704, ups=0.75, wpb=489736, bsz=16489.7, num_updates=16700, lr=0.000489409, gnorm=0.282, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=22751 epoch 010: 1390 / 1707 loss=3.713, nll_loss=2.177, ppl=4.52, wps=365704, ups=0.75, wpb=489736, bsz=16489.7, num_updates=16700, lr=0.000489409, gnorm=0.282, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=22751 epoch 010: 1490 / 1707 loss=3.72, nll_loss=2.184, ppl=4.55, wps=365792, ups=0.75, wpb=490006, bsz=16392.3, num_updates=16800, lr=0.00048795, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22885 epoch 010: 1490 / 1707 loss=3.72, nll_loss=2.184, ppl=4.55, wps=365792, ups=0.75, wpb=490006, bsz=16392.3, num_updates=16800, lr=0.00048795, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22885 epoch 010: 1490 / 1707 loss=3.72, nll_loss=2.184, ppl=4.55, wps=365792, ups=0.75, wpb=490006, bsz=16392.3, num_updates=16800, lr=0.00048795, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22885 epoch 010: 1490 / 1707 loss=3.72, nll_loss=2.184, ppl=4.55, wps=365792, ups=0.75, wpb=490006, bsz=16392.3, num_updates=16800, lr=0.00048795, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22885 epoch 010: 1490 / 1707 loss=3.72, nll_loss=2.184, ppl=4.55, wps=365792, ups=0.75, wpb=490006, bsz=16392.3, num_updates=16800, lr=0.00048795, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22885 epoch 010: 1490 / 1707 loss=3.72, nll_loss=2.184, ppl=4.55, wps=365792, ups=0.75, wpb=490006, bsz=16392.3, num_updates=16800, lr=0.00048795, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22885 epoch 010: 1490 / 1707 loss=3.72, nll_loss=2.184, ppl=4.55, wps=365792, ups=0.75, wpb=490006, bsz=16392.3, num_updates=16800, lr=0.00048795, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22885 epoch 010: 1490 / 1707 loss=3.72, nll_loss=2.184, ppl=4.55, wps=365792, ups=0.75, wpb=490006, bsz=16392.3, num_updates=16800, lr=0.00048795, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22885 epoch 010: 1490 / 1707 loss=3.72, nll_loss=2.184, ppl=4.55, wps=365792, ups=0.75, wpb=490006, bsz=16392.3, num_updates=16800, lr=0.00048795, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22885 epoch 010: 1490 / 1707 loss=3.72, nll_loss=2.184, ppl=4.55, wps=365792, ups=0.75, wpb=490006, bsz=16392.3, num_updates=16800, lr=0.00048795, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=22885 epoch 010: 1590 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366149, ups=0.75, wpb=490023, bsz=16398.6, num_updates=16900, lr=0.000486504, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=23019 epoch 010: 1590 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366149, ups=0.75, wpb=490023, bsz=16398.6, num_updates=16900, lr=0.000486504, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=23019 epoch 010: 1590 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366149, ups=0.75, wpb=490023, bsz=16398.6, num_updates=16900, lr=0.000486504, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=23019 epoch 010: 1590 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366149, ups=0.75, wpb=490023, bsz=16398.6, num_updates=16900, lr=0.000486504, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=23019 epoch 010: 1590 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366149, ups=0.75, wpb=490023, bsz=16398.6, num_updates=16900, lr=0.000486504, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=23019 epoch 010: 1590 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366149, ups=0.75, wpb=490023, bsz=16398.6, num_updates=16900, lr=0.000486504, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=23019 epoch 010: 1590 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366149, ups=0.75, wpb=490023, bsz=16398.6, num_updates=16900, lr=0.000486504, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=23019 epoch 010: 1590 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366149, ups=0.75, wpb=490023, bsz=16398.6, num_updates=16900, lr=0.000486504, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=23019 epoch 010: 1590 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366149, ups=0.75, wpb=490023, bsz=16398.6, num_updates=16900, lr=0.000486504, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=23019 epoch 010: 1590 / 1707 loss=3.72, nll_loss=2.184, ppl=4.54, wps=366149, ups=0.75, wpb=490023, bsz=16398.6, num_updates=16900, lr=0.000486504, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=23019 epoch 010: 1690 / 1707 loss=3.718, nll_loss=2.183, ppl=4.54, wps=365230, ups=0.75, wpb=489346, bsz=16192.7, num_updates=17000, lr=0.000485071, gnorm=0.274, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=23153 epoch 010: 1690 / 1707 loss=3.718, nll_loss=2.183, ppl=4.54, wps=365230, ups=0.75, wpb=489346, bsz=16192.7, num_updates=17000, lr=0.000485071, gnorm=0.274, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=23153 epoch 010: 1690 / 1707 loss=3.718, nll_loss=2.183, ppl=4.54, wps=365230, ups=0.75, wpb=489346, bsz=16192.7, num_updates=17000, lr=0.000485071, gnorm=0.274, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=23153 epoch 010: 1690 / 1707 loss=3.718, nll_loss=2.183, ppl=4.54, wps=365230, ups=0.75, wpb=489346, bsz=16192.7, num_updates=17000, lr=0.000485071, gnorm=0.274, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=23153 epoch 010: 1690 / 1707 loss=3.718, nll_loss=2.183, ppl=4.54, wps=365230, ups=0.75, wpb=489346, bsz=16192.7, num_updates=17000, lr=0.000485071, gnorm=0.274, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=23153 epoch 010: 1690 / 1707 loss=3.718, nll_loss=2.183, ppl=4.54, wps=365230, ups=0.75, wpb=489346, bsz=16192.7, num_updates=17000, lr=0.000485071, gnorm=0.274, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=23153 epoch 010: 1690 / 1707 loss=3.718, nll_loss=2.183, ppl=4.54, wps=365230, ups=0.75, wpb=489346, bsz=16192.7, num_updates=17000, lr=0.000485071, gnorm=0.274, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=23153 epoch 010: 1690 / 1707 loss=3.718, nll_loss=2.183, ppl=4.54, wps=365230, ups=0.75, wpb=489346, bsz=16192.7, num_updates=17000, lr=0.000485071, gnorm=0.274, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=23153 epoch 010: 1690 / 1707 loss=3.718, nll_loss=2.183, ppl=4.54, wps=365230, ups=0.75, wpb=489346, bsz=16192.7, num_updates=17000, lr=0.000485071, gnorm=0.274, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=23153 epoch 010: 1690 / 1707 loss=3.718, nll_loss=2.183, ppl=4.54, wps=365230, ups=0.75, wpb=489346, bsz=16192.7, num_updates=17000, lr=0.000485071, gnorm=0.274, clip=0, loss_scale=8, train_wall=133, gb_free=60.4, wall=23153 begin validation on "valid" subset epoch 010 | valid on 'valid' subset | loss 3.848 | nll_loss 2.307 | ppl 4.95 | wps 219811 | wpb 22263 | bsz 1004 | num_updates 17000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.848 | nll_loss 2.307 | ppl 4.95 | wps 219811 | wpb 22263 | bsz 1004 | num_updates 17000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.848 | nll_loss 2.307 | ppl 4.95 | wps 219811 | wpb 22263 | bsz 1004 | num_updates 17000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.848 | nll_loss 2.307 | ppl 4.95 | wps 219811 | wpb 22263 | bsz 1004 | num_updates 17000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.848 | nll_loss 2.307 | ppl 4.95 | wps 219811 | wpb 22263 | bsz 1004 | num_updates 17000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.848 | nll_loss 2.307 | ppl 4.95 | wps 219811 | wpb 22263 | bsz 1004 | num_updates 17000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.848 | nll_loss 2.307 | ppl 4.95 | wps 219811 | wpb 22263 | bsz 1004 | num_updates 17000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.848 | nll_loss 2.307 | ppl 4.95 | wps 219811 | wpb 22263 | bsz 1004 | num_updates 17000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.848 | nll_loss 2.307 | ppl 4.95 | wps 219811 | wpb 22263 | bsz 1004 | num_updates 17000 | best_loss 3.839 epoch 010 | valid on 'valid' subset | loss 3.848 | nll_loss 2.307 | ppl 4.95 | wps 219811 | wpb 22263 | bsz 1004 | num_updates 17000 | best_loss 3.839 end of epoch 10 (average epoch stats below) epoch 010 | loss 3.716 | nll_loss 2.179 | ppl 4.53 | wps 360737 | ups 0.74 | wpb 489886 | bsz 16332.9 | num_updates 17017 | lr 0.000484829 | gnorm 0.269 | clip 0 | loss_scale 8 | train_wall 2272 | gb_free 60.9 | wall 23187 epoch 010 | loss 3.716 | nll_loss 2.179 | ppl 4.53 | wps 360737 | ups 0.74 | wpb 489886 | bsz 16332.9 | num_updates 17017 | lr 0.000484829 | gnorm 0.269 | clip 0 | loss_scale 8 | train_wall 2272 | gb_free 60.9 | wall 23187 epoch 010 | loss 3.716 | nll_loss 2.179 | ppl 4.53 | wps 360737 | ups 0.74 | wpb 489886 | bsz 16332.9 | num_updates 17017 | lr 0.000484829 | gnorm 0.269 | clip 0 | loss_scale 8 | train_wall 2272 | gb_free 60.9 | wall 23187 epoch 010 | loss 3.716 | nll_loss 2.179 | ppl 4.53 | wps 360737 | ups 0.74 | wpb 489886 | bsz 16332.9 | num_updates 17017 | lr 0.000484829 | gnorm 0.269 | clip 0 | loss_scale 8 | train_wall 2272 | gb_free 60.9 | wall 23187 epoch 010 | loss 3.716 | nll_loss 2.179 | ppl 4.53 | wps 360737 | ups 0.74 | wpb 489886 | bsz 16332.9 | num_updates 17017 | lr 0.000484829 | gnorm 0.269 | clip 0 | loss_scale 8 | train_wall 2272 | gb_free 60.9 | wall 23187 epoch 010 | loss 3.716 | nll_loss 2.179 | ppl 4.53 | wps 360737 | ups 0.74 | wpb 489886 | bsz 16332.9 | num_updates 17017 | lr 0.000484829 | gnorm 0.269 | clip 0 | loss_scale 8 | train_wall 2272 | gb_free 60.9 | wall 23187 epoch 010 | loss 3.716 | nll_loss 2.179 | ppl 4.53 | wps 360737 | ups 0.74 | wpb 489886 | bsz 16332.9 | num_updates 17017 | lr 0.000484829 | gnorm 0.269 | clip 0 | loss_scale 8 | train_wall 2272 | gb_free 60.9 | wall 23187 epoch 010 | loss 3.716 | nll_loss 2.179 | ppl 4.53 | wps 360737 | ups 0.74 | wpb 489886 | bsz 16332.9 | num_updates 17017 | lr 0.000484829 | gnorm 0.269 | clip 0 | loss_scale 8 | train_wall 2272 | gb_free 60.9 | wall 23187 epoch 010 | loss 3.716 | nll_loss 2.179 | ppl 4.53 | wps 360737 | ups 0.74 | wpb 489886 | bsz 16332.9 | num_updates 17017 | lr 0.000484829 | gnorm 0.269 | clip 0 | loss_scale 8 | train_wall 2272 | gb_free 60.9 | wall 23187 epoch 010 | loss 3.716 | nll_loss 2.179 | ppl 4.53 | wps 360737 | ups 0.74 | wpb 489886 | bsz 16332.9 | num_updates 17017 | lr 0.000484829 | gnorm 0.269 | clip 0 | loss_scale 8 | train_wall 2272 | gb_free 60.9 | wall 23187 Start iterating over samples epoch 011: 84 / 1707 loss=3.687, nll_loss=2.147, ppl=4.43, wps=333301, ups=0.69, wpb=486490, bsz=16225.4, num_updates=17100, lr=0.000483651, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=23299 epoch 011: 84 / 1707 loss=3.687, nll_loss=2.147, ppl=4.43, wps=333301, ups=0.69, wpb=486490, bsz=16225.4, num_updates=17100, lr=0.000483651, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=23299 epoch 011: 84 / 1707 loss=3.687, nll_loss=2.147, ppl=4.43, wps=333301, ups=0.69, wpb=486490, bsz=16225.4, num_updates=17100, lr=0.000483651, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=23299 epoch 011: 84 / 1707 loss=3.687, nll_loss=2.147, ppl=4.43, wps=333301, ups=0.69, wpb=486490, bsz=16225.4, num_updates=17100, lr=0.000483651, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=23299 epoch 011: 84 / 1707 loss=3.687, nll_loss=2.147, ppl=4.43, wps=333301, ups=0.69, wpb=486490, bsz=16225.4, num_updates=17100, lr=0.000483651, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=23299 epoch 011: 84 / 1707 loss=3.687, nll_loss=2.147, ppl=4.43, wps=333301, ups=0.69, wpb=486490, bsz=16225.4, num_updates=17100, lr=0.000483651, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=23299 epoch 011: 84 / 1707 loss=3.687, nll_loss=2.147, ppl=4.43, wps=333301, ups=0.69, wpb=486490, bsz=16225.4, num_updates=17100, lr=0.000483651, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=23299 epoch 011: 84 / 1707 loss=3.687, nll_loss=2.147, ppl=4.43, wps=333301, ups=0.69, wpb=486490, bsz=16225.4, num_updates=17100, lr=0.000483651, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=23299 epoch 011: 84 / 1707 loss=3.687, nll_loss=2.147, ppl=4.43, wps=333301, ups=0.69, wpb=486490, bsz=16225.4, num_updates=17100, lr=0.000483651, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=23299 epoch 011: 84 / 1707 loss=3.687, nll_loss=2.147, ppl=4.43, wps=333301, ups=0.69, wpb=486490, bsz=16225.4, num_updates=17100, lr=0.000483651, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=23299 epoch 011: 84 / 1707 loss=3.687, nll_loss=2.147, ppl=4.43, wps=333301, ups=0.69, wpb=486490, bsz=16225.4, num_updates=17100, lr=0.000483651, gnorm=0.27, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=23299 epoch 011: 184 / 1707 loss=3.685, nll_loss=2.144, ppl=4.42, wps=367206, ups=0.75, wpb=489195, bsz=16107.4, num_updates=17200, lr=0.000482243, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=23432 epoch 011: 184 / 1707 loss=3.685, nll_loss=2.144, ppl=4.42, wps=367206, ups=0.75, wpb=489195, bsz=16107.4, num_updates=17200, lr=0.000482243, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=23432 epoch 011: 184 / 1707 loss=3.685, nll_loss=2.144, ppl=4.42, wps=367206, ups=0.75, wpb=489195, bsz=16107.4, num_updates=17200, lr=0.000482243, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=23432 epoch 011: 184 / 1707 loss=3.685, nll_loss=2.144, ppl=4.42, wps=367206, ups=0.75, wpb=489195, bsz=16107.4, num_updates=17200, lr=0.000482243, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=23432 epoch 011: 184 / 1707 loss=3.685, nll_loss=2.144, ppl=4.42, wps=367206, ups=0.75, wpb=489195, bsz=16107.4, num_updates=17200, lr=0.000482243, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=23432 epoch 011: 184 / 1707 loss=3.685, nll_loss=2.144, ppl=4.42, wps=367206, ups=0.75, wpb=489195, bsz=16107.4, num_updates=17200, lr=0.000482243, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=23432 epoch 011: 184 / 1707 loss=3.685, nll_loss=2.144, ppl=4.42, wps=367206, ups=0.75, wpb=489195, bsz=16107.4, num_updates=17200, lr=0.000482243, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=23432 epoch 011: 184 / 1707 loss=3.685, nll_loss=2.144, ppl=4.42, wps=367206, ups=0.75, wpb=489195, bsz=16107.4, num_updates=17200, lr=0.000482243, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=23432 epoch 011: 184 / 1707 loss=3.685, nll_loss=2.144, ppl=4.42, wps=367206, ups=0.75, wpb=489195, bsz=16107.4, num_updates=17200, lr=0.000482243, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=23432 epoch 011: 184 / 1707 loss=3.685, nll_loss=2.144, ppl=4.42, wps=367206, ups=0.75, wpb=489195, bsz=16107.4, num_updates=17200, lr=0.000482243, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=23432 epoch 011: 184 / 1707 loss=3.685, nll_loss=2.144, ppl=4.42, wps=367206, ups=0.75, wpb=489195, bsz=16107.4, num_updates=17200, lr=0.000482243, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=23432 epoch 011: 285 / 1707 loss=3.69, nll_loss=2.15, ppl=4.44, wps=362788, ups=0.74, wpb=489649, bsz=16461.9, num_updates=17300, lr=0.000480847, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23567 epoch 011: 285 / 1707 loss=3.69, nll_loss=2.15, ppl=4.44, wps=362788, ups=0.74, wpb=489649, bsz=16461.9, num_updates=17300, lr=0.000480847, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23567 epoch 011: 285 / 1707 loss=3.69, nll_loss=2.15, ppl=4.44, wps=362788, ups=0.74, wpb=489649, bsz=16461.9, num_updates=17300, lr=0.000480847, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23567 epoch 011: 285 / 1707 loss=3.69, nll_loss=2.15, ppl=4.44, wps=362788, ups=0.74, wpb=489649, bsz=16461.9, num_updates=17300, lr=0.000480847, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23567 epoch 011: 285 / 1707 loss=3.69, nll_loss=2.15, ppl=4.44, wps=362788, ups=0.74, wpb=489649, bsz=16461.9, num_updates=17300, lr=0.000480847, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23567 epoch 011: 285 / 1707 loss=3.69, nll_loss=2.15, ppl=4.44, wps=362788, ups=0.74, wpb=489649, bsz=16461.9, num_updates=17300, lr=0.000480847, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23567 epoch 011: 285 / 1707 loss=3.69, nll_loss=2.15, ppl=4.44, wps=362788, ups=0.74, wpb=489649, bsz=16461.9, num_updates=17300, lr=0.000480847, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23567 epoch 011: 285 / 1707 loss=3.69, nll_loss=2.15, ppl=4.44, wps=362788, ups=0.74, wpb=489649, bsz=16461.9, num_updates=17300, lr=0.000480847, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23567 epoch 011: 285 / 1707 loss=3.69, nll_loss=2.15, ppl=4.44, wps=362788, ups=0.74, wpb=489649, bsz=16461.9, num_updates=17300, lr=0.000480847, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23567 epoch 011: 285 / 1707 loss=3.69, nll_loss=2.15, ppl=4.44, wps=362788, ups=0.74, wpb=489649, bsz=16461.9, num_updates=17300, lr=0.000480847, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23567 epoch 011: 285 / 1707 loss=3.69, nll_loss=2.15, ppl=4.44, wps=362788, ups=0.74, wpb=489649, bsz=16461.9, num_updates=17300, lr=0.000480847, gnorm=0.273, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23567 epoch 011: 385 / 1707 loss=3.691, nll_loss=2.151, ppl=4.44, wps=365428, ups=0.75, wpb=489710, bsz=16672.4, num_updates=17400, lr=0.000479463, gnorm=0.266, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23701 epoch 011: 385 / 1707 loss=3.691, nll_loss=2.151, ppl=4.44, wps=365428, ups=0.75, wpb=489710, bsz=16672.4, num_updates=17400, lr=0.000479463, gnorm=0.266, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23701 epoch 011: 385 / 1707 loss=3.691, nll_loss=2.151, ppl=4.44, wps=365428, ups=0.75, wpb=489710, bsz=16672.4, num_updates=17400, lr=0.000479463, gnorm=0.266, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23701 epoch 011: 385 / 1707 loss=3.691, nll_loss=2.151, ppl=4.44, wps=365428, ups=0.75, wpb=489710, bsz=16672.4, num_updates=17400, lr=0.000479463, gnorm=0.266, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23701 epoch 011: 385 / 1707 loss=3.691, nll_loss=2.151, ppl=4.44, wps=365428, ups=0.75, wpb=489710, bsz=16672.4, num_updates=17400, lr=0.000479463, gnorm=0.266, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23701 epoch 011: 385 / 1707 loss=3.691, nll_loss=2.151, ppl=4.44, wps=365428, ups=0.75, wpb=489710, bsz=16672.4, num_updates=17400, lr=0.000479463, gnorm=0.266, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23701 epoch 011: 385 / 1707 loss=3.691, nll_loss=2.151, ppl=4.44, wps=365428, ups=0.75, wpb=489710, bsz=16672.4, num_updates=17400, lr=0.000479463, gnorm=0.266, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23701 epoch 011: 385 / 1707 loss=3.691, nll_loss=2.151, ppl=4.44, wps=365428, ups=0.75, wpb=489710, bsz=16672.4, num_updates=17400, lr=0.000479463, gnorm=0.266, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23701 epoch 011: 385 / 1707 loss=3.691, nll_loss=2.151, ppl=4.44, wps=365428, ups=0.75, wpb=489710, bsz=16672.4, num_updates=17400, lr=0.000479463, gnorm=0.266, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23701 epoch 011: 385 / 1707 loss=3.691, nll_loss=2.151, ppl=4.44, wps=365428, ups=0.75, wpb=489710, bsz=16672.4, num_updates=17400, lr=0.000479463, gnorm=0.266, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23701 epoch 011: 385 / 1707 loss=3.691, nll_loss=2.151, ppl=4.44, wps=365428, ups=0.75, wpb=489710, bsz=16672.4, num_updates=17400, lr=0.000479463, gnorm=0.266, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=23701 epoch 011: 485 / 1707 loss=3.692, nll_loss=2.152, ppl=4.45, wps=366810, ups=0.75, wpb=490507, bsz=16386, num_updates=17500, lr=0.000478091, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=23835 epoch 011: 485 / 1707 loss=3.692, nll_loss=2.152, ppl=4.45, wps=366810, ups=0.75, wpb=490507, bsz=16386, num_updates=17500, lr=0.000478091, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=23835 epoch 011: 485 / 1707 loss=3.692, nll_loss=2.152, ppl=4.45, wps=366810, ups=0.75, wpb=490507, bsz=16386, num_updates=17500, lr=0.000478091, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=23835 epoch 011: 485 / 1707 loss=3.692, nll_loss=2.152, ppl=4.45, wps=366810, ups=0.75, wpb=490507, bsz=16386, num_updates=17500, lr=0.000478091, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=23835 epoch 011: 485 / 1707 loss=3.692, nll_loss=2.152, ppl=4.45, wps=366810, ups=0.75, wpb=490507, bsz=16386, num_updates=17500, lr=0.000478091, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=23835 epoch 011: 485 / 1707 loss=3.692, nll_loss=2.152, ppl=4.45, wps=366810, ups=0.75, wpb=490507, bsz=16386, num_updates=17500, lr=0.000478091, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=23835 epoch 011: 485 / 1707 loss=3.692, nll_loss=2.152, ppl=4.45, wps=366810, ups=0.75, wpb=490507, bsz=16386, num_updates=17500, lr=0.000478091, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=23835 epoch 011: 485 / 1707 loss=3.692, nll_loss=2.152, ppl=4.45, wps=366810, ups=0.75, wpb=490507, bsz=16386, num_updates=17500, lr=0.000478091, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=23835 epoch 011: 485 / 1707 loss=3.692, nll_loss=2.152, ppl=4.45, wps=366810, ups=0.75, wpb=490507, bsz=16386, num_updates=17500, lr=0.000478091, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=23835 epoch 011: 485 / 1707 loss=3.692, nll_loss=2.152, ppl=4.45, wps=366810, ups=0.75, wpb=490507, bsz=16386, num_updates=17500, lr=0.000478091, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=23835 epoch 011: 485 / 1707 loss=3.692, nll_loss=2.152, ppl=4.45, wps=366810, ups=0.75, wpb=490507, bsz=16386, num_updates=17500, lr=0.000478091, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=23835 epoch 011: 585 / 1707 loss=3.696, nll_loss=2.157, ppl=4.46, wps=367286, ups=0.75, wpb=490524, bsz=16296.4, num_updates=17600, lr=0.000476731, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=23969 epoch 011: 585 / 1707 loss=3.696, nll_loss=2.157, ppl=4.46, wps=367286, ups=0.75, wpb=490524, bsz=16296.4, num_updates=17600, lr=0.000476731, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=23969 epoch 011: 585 / 1707 loss=3.696, nll_loss=2.157, ppl=4.46, wps=367286, ups=0.75, wpb=490524, bsz=16296.4, num_updates=17600, lr=0.000476731, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=23969 epoch 011: 585 / 1707 loss=3.696, nll_loss=2.157, ppl=4.46, wps=367286, ups=0.75, wpb=490524, bsz=16296.4, num_updates=17600, lr=0.000476731, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=23969 epoch 011: 585 / 1707 loss=3.696, nll_loss=2.157, ppl=4.46, wps=367286, ups=0.75, wpb=490524, bsz=16296.4, num_updates=17600, lr=0.000476731, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=23969 epoch 011: 585 / 1707 loss=3.696, nll_loss=2.157, ppl=4.46, wps=367286, ups=0.75, wpb=490524, bsz=16296.4, num_updates=17600, lr=0.000476731, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=23969 epoch 011: 585 / 1707 loss=3.696, nll_loss=2.157, ppl=4.46, wps=367286, ups=0.75, wpb=490524, bsz=16296.4, num_updates=17600, lr=0.000476731, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=23969 epoch 011: 585 / 1707 loss=3.696, nll_loss=2.157, ppl=4.46, wps=367286, ups=0.75, wpb=490524, bsz=16296.4, num_updates=17600, lr=0.000476731, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=23969 epoch 011: 585 / 1707 loss=3.696, nll_loss=2.157, ppl=4.46, wps=367286, ups=0.75, wpb=490524, bsz=16296.4, num_updates=17600, lr=0.000476731, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=23969 epoch 011: 585 / 1707 loss=3.696, nll_loss=2.157, ppl=4.46, wps=367286, ups=0.75, wpb=490524, bsz=16296.4, num_updates=17600, lr=0.000476731, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=23969 epoch 011: 585 / 1707 loss=3.696, nll_loss=2.157, ppl=4.46, wps=367286, ups=0.75, wpb=490524, bsz=16296.4, num_updates=17600, lr=0.000476731, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=23969 epoch 011: 685 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366360, ups=0.75, wpb=490707, bsz=16327.4, num_updates=17700, lr=0.000475383, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24103 epoch 011: 685 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366360, ups=0.75, wpb=490707, bsz=16327.4, num_updates=17700, lr=0.000475383, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24103 epoch 011: 685 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366360, ups=0.75, wpb=490707, bsz=16327.4, num_updates=17700, lr=0.000475383, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24103 epoch 011: 685 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366360, ups=0.75, wpb=490707, bsz=16327.4, num_updates=17700, lr=0.000475383, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24103 epoch 011: 685 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366360, ups=0.75, wpb=490707, bsz=16327.4, num_updates=17700, lr=0.000475383, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24103 epoch 011: 685 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366360, ups=0.75, wpb=490707, bsz=16327.4, num_updates=17700, lr=0.000475383, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24103 epoch 011: 685 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366360, ups=0.75, wpb=490707, bsz=16327.4, num_updates=17700, lr=0.000475383, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24103 epoch 011: 685 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366360, ups=0.75, wpb=490707, bsz=16327.4, num_updates=17700, lr=0.000475383, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24103 epoch 011: 685 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366360, ups=0.75, wpb=490707, bsz=16327.4, num_updates=17700, lr=0.000475383, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24103 epoch 011: 685 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366360, ups=0.75, wpb=490707, bsz=16327.4, num_updates=17700, lr=0.000475383, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24103 epoch 011: 685 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366360, ups=0.75, wpb=490707, bsz=16327.4, num_updates=17700, lr=0.000475383, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24103 epoch 011: 786 / 1707 loss=3.693, nll_loss=2.154, ppl=4.45, wps=364220, ups=0.74, wpb=490740, bsz=16464.1, num_updates=17800, lr=0.000474045, gnorm=0.275, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24237 epoch 011: 786 / 1707 loss=3.693, nll_loss=2.154, ppl=4.45, wps=364220, ups=0.74, wpb=490740, bsz=16464.1, num_updates=17800, lr=0.000474045, gnorm=0.275, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24237 epoch 011: 786 / 1707 loss=3.693, nll_loss=2.154, ppl=4.45, wps=364220, ups=0.74, wpb=490740, bsz=16464.1, num_updates=17800, lr=0.000474045, gnorm=0.275, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24237 epoch 011: 786 / 1707 loss=3.693, nll_loss=2.154, ppl=4.45, wps=364220, ups=0.74, wpb=490740, bsz=16464.1, num_updates=17800, lr=0.000474045, gnorm=0.275, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24237 epoch 011: 786 / 1707 loss=3.693, nll_loss=2.154, ppl=4.45, wps=364220, ups=0.74, wpb=490740, bsz=16464.1, num_updates=17800, lr=0.000474045, gnorm=0.275, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24237 epoch 011: 786 / 1707 loss=3.693, nll_loss=2.154, ppl=4.45, wps=364220, ups=0.74, wpb=490740, bsz=16464.1, num_updates=17800, lr=0.000474045, gnorm=0.275, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24237 epoch 011: 786 / 1707 loss=3.693, nll_loss=2.154, ppl=4.45, wps=364220, ups=0.74, wpb=490740, bsz=16464.1, num_updates=17800, lr=0.000474045, gnorm=0.275, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24237 epoch 011: 786 / 1707 loss=3.693, nll_loss=2.154, ppl=4.45, wps=364220, ups=0.74, wpb=490740, bsz=16464.1, num_updates=17800, lr=0.000474045, gnorm=0.275, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24237 epoch 011: 786 / 1707 loss=3.693, nll_loss=2.154, ppl=4.45, wps=364220, ups=0.74, wpb=490740, bsz=16464.1, num_updates=17800, lr=0.000474045, gnorm=0.275, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24237 epoch 011: 786 / 1707 loss=3.693, nll_loss=2.154, ppl=4.45, wps=364220, ups=0.74, wpb=490740, bsz=16464.1, num_updates=17800, lr=0.000474045, gnorm=0.275, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24237 epoch 011: 786 / 1707 loss=3.693, nll_loss=2.154, ppl=4.45, wps=364220, ups=0.74, wpb=490740, bsz=16464.1, num_updates=17800, lr=0.000474045, gnorm=0.275, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24237 epoch 011: 886 / 1707 loss=3.691, nll_loss=2.152, ppl=4.44, wps=365602, ups=0.75, wpb=489367, bsz=16177.7, num_updates=17900, lr=0.000472719, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=24371 epoch 011: 886 / 1707 loss=3.691, nll_loss=2.152, ppl=4.44, wps=365602, ups=0.75, wpb=489367, bsz=16177.7, num_updates=17900, lr=0.000472719, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=24371 epoch 011: 886 / 1707 loss=3.691, nll_loss=2.152, ppl=4.44, wps=365602, ups=0.75, wpb=489367, bsz=16177.7, num_updates=17900, lr=0.000472719, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=24371 epoch 011: 886 / 1707 loss=3.691, nll_loss=2.152, ppl=4.44, wps=365602, ups=0.75, wpb=489367, bsz=16177.7, num_updates=17900, lr=0.000472719, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=24371 epoch 011: 886 / 1707 loss=3.691, nll_loss=2.152, ppl=4.44, wps=365602, ups=0.75, wpb=489367, bsz=16177.7, num_updates=17900, lr=0.000472719, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=24371 epoch 011: 886 / 1707 loss=3.691, nll_loss=2.152, ppl=4.44, wps=365602, ups=0.75, wpb=489367, bsz=16177.7, num_updates=17900, lr=0.000472719, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=24371 epoch 011: 886 / 1707 loss=3.691, nll_loss=2.152, ppl=4.44, wps=365602, ups=0.75, wpb=489367, bsz=16177.7, num_updates=17900, lr=0.000472719, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=24371 epoch 011: 886 / 1707 loss=3.691, nll_loss=2.152, ppl=4.44, wps=365602, ups=0.75, wpb=489367, bsz=16177.7, num_updates=17900, lr=0.000472719, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=24371 epoch 011: 886 / 1707 loss=3.691, nll_loss=2.152, ppl=4.44, wps=365602, ups=0.75, wpb=489367, bsz=16177.7, num_updates=17900, lr=0.000472719, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=24371 epoch 011: 886 / 1707 loss=3.691, nll_loss=2.152, ppl=4.44, wps=365602, ups=0.75, wpb=489367, bsz=16177.7, num_updates=17900, lr=0.000472719, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=24371 epoch 011: 886 / 1707 loss=3.691, nll_loss=2.152, ppl=4.44, wps=365602, ups=0.75, wpb=489367, bsz=16177.7, num_updates=17900, lr=0.000472719, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=24371 epoch 011: 986 / 1707 loss=3.694, nll_loss=2.156, ppl=4.46, wps=366008, ups=0.75, wpb=489997, bsz=16131.8, num_updates=18000, lr=0.000471405, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=24505 epoch 011: 986 / 1707 loss=3.694, nll_loss=2.156, ppl=4.46, wps=366008, ups=0.75, wpb=489997, bsz=16131.8, num_updates=18000, lr=0.000471405, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=24505 epoch 011: 986 / 1707 loss=3.694, nll_loss=2.156, ppl=4.46, wps=366008, ups=0.75, wpb=489997, bsz=16131.8, num_updates=18000, lr=0.000471405, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=24505 epoch 011: 986 / 1707 loss=3.694, nll_loss=2.156, ppl=4.46, wps=366008, ups=0.75, wpb=489997, bsz=16131.8, num_updates=18000, lr=0.000471405, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=24505 epoch 011: 986 / 1707 loss=3.694, nll_loss=2.156, ppl=4.46, wps=366008, ups=0.75, wpb=489997, bsz=16131.8, num_updates=18000, lr=0.000471405, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=24505 epoch 011: 986 / 1707 loss=3.694, nll_loss=2.156, ppl=4.46, wps=366008, ups=0.75, wpb=489997, bsz=16131.8, num_updates=18000, lr=0.000471405, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=24505 epoch 011: 986 / 1707 loss=3.694, nll_loss=2.156, ppl=4.46, wps=366008, ups=0.75, wpb=489997, bsz=16131.8, num_updates=18000, lr=0.000471405, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=24505 epoch 011: 986 / 1707 loss=3.694, nll_loss=2.156, ppl=4.46, wps=366008, ups=0.75, wpb=489997, bsz=16131.8, num_updates=18000, lr=0.000471405, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=24505 epoch 011: 986 / 1707 loss=3.694, nll_loss=2.156, ppl=4.46, wps=366008, ups=0.75, wpb=489997, bsz=16131.8, num_updates=18000, lr=0.000471405, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=24505 epoch 011: 986 / 1707 loss=3.694, nll_loss=2.156, ppl=4.46, wps=366008, ups=0.75, wpb=489997, bsz=16131.8, num_updates=18000, lr=0.000471405, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=24505 epoch 011: 986 / 1707 loss=3.694, nll_loss=2.156, ppl=4.46, wps=366008, ups=0.75, wpb=489997, bsz=16131.8, num_updates=18000, lr=0.000471405, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=24505 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 3.819 | nll_loss 2.27 | ppl 4.82 | wps 220078 | wpb 22263 | bsz 1004 | num_updates 18000 | best_loss 3.819 epoch 011 | valid on 'valid' subset | loss 3.819 | nll_loss 2.27 | ppl 4.82 | wps 220078 | wpb 22263 | bsz 1004 | num_updates 18000 | best_loss 3.819 epoch 011 | valid on 'valid' subset | loss 3.819 | nll_loss 2.27 | ppl 4.82 | wps 220078 | wpb 22263 | bsz 1004 | num_updates 18000 | best_loss 3.819 epoch 011 | valid on 'valid' subset | loss 3.819 | nll_loss 2.27 | ppl 4.82 | wps 220078 | wpb 22263 | bsz 1004 | num_updates 18000 | best_loss 3.819 epoch 011 | valid on 'valid' subset | loss 3.819 | nll_loss 2.27 | ppl 4.82 | wps 220078 | wpb 22263 | bsz 1004 | num_updates 18000 | best_loss 3.819 epoch 011 | valid on 'valid' subset | loss 3.819 | nll_loss 2.27 | ppl 4.82 | wps 220078 | wpb 22263 | bsz 1004 | num_updates 18000 | best_loss 3.819 epoch 011 | valid on 'valid' subset | loss 3.819 | nll_loss 2.27 | ppl 4.82 | wps 220078 | wpb 22263 | bsz 1004 | num_updates 18000 | best_loss 3.819 epoch 011 | valid on 'valid' subset | loss 3.819 | nll_loss 2.27 | ppl 4.82 | wps 220078 | wpb 22263 | bsz 1004 | num_updates 18000 | best_loss 3.819 epoch 011 | valid on 'valid' subset | loss 3.819 | nll_loss 2.27 | ppl 4.82 | wps 220078 | wpb 22263 | bsz 1004 | num_updates 18000 | best_loss 3.819 epoch 011 | valid on 'valid' subset | loss 3.819 | nll_loss 2.27 | ppl 4.82 | wps 220078 | wpb 22263 | bsz 1004 | num_updates 18000 | best_loss 3.819 epoch 011 | valid on 'valid' subset | loss 3.819 | nll_loss 2.27 | ppl 4.82 | wps 220078 | wpb 22263 | bsz 1004 | num_updates 18000 | best_loss 3.819 epoch 011: 1087 / 1707 loss=3.699, nll_loss=2.161, ppl=4.47, wps=323066, ups=0.66, wpb=489840, bsz=16249.6, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24657 epoch 011: 1087 / 1707 loss=3.699, nll_loss=2.161, ppl=4.47, wps=323066, ups=0.66, wpb=489840, bsz=16249.6, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24657 epoch 011: 1087 / 1707 loss=3.699, nll_loss=2.161, ppl=4.47, wps=323066, ups=0.66, wpb=489840, bsz=16249.6, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24657 epoch 011: 1087 / 1707 loss=3.699, nll_loss=2.161, ppl=4.47, wps=323066, ups=0.66, wpb=489840, bsz=16249.6, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24657 epoch 011: 1087 / 1707 loss=3.699, nll_loss=2.161, ppl=4.47, wps=323066, ups=0.66, wpb=489840, bsz=16249.6, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24657 epoch 011: 1087 / 1707 loss=3.699, nll_loss=2.161, ppl=4.47, wps=323066, ups=0.66, wpb=489840, bsz=16249.6, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24657 epoch 011: 1087 / 1707 loss=3.699, nll_loss=2.161, ppl=4.47, wps=323066, ups=0.66, wpb=489840, bsz=16249.6, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24657 epoch 011: 1087 / 1707 loss=3.699, nll_loss=2.161, ppl=4.47, wps=323066, ups=0.66, wpb=489840, bsz=16249.6, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24657 epoch 011: 1087 / 1707 loss=3.699, nll_loss=2.161, ppl=4.47, wps=323066, ups=0.66, wpb=489840, bsz=16249.6, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24657 epoch 011: 1087 / 1707 loss=3.699, nll_loss=2.161, ppl=4.47, wps=323066, ups=0.66, wpb=489840, bsz=16249.6, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24657 epoch 011: 1087 / 1707 loss=3.699, nll_loss=2.161, ppl=4.47, wps=323066, ups=0.66, wpb=489840, bsz=16249.6, num_updates=18100, lr=0.0004701, gnorm=0.281, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=24657 epoch 011: 1187 / 1707 loss=3.701, nll_loss=2.163, ppl=4.48, wps=367067, ups=0.75, wpb=490997, bsz=16630.5, num_updates=18200, lr=0.000468807, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=24790 epoch 011: 1187 / 1707 loss=3.701, nll_loss=2.163, ppl=4.48, wps=367067, ups=0.75, wpb=490997, bsz=16630.5, num_updates=18200, lr=0.000468807, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=24790 epoch 011: 1187 / 1707 loss=3.701, nll_loss=2.163, ppl=4.48, wps=367067, ups=0.75, wpb=490997, bsz=16630.5, num_updates=18200, lr=0.000468807, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=24790 epoch 011: 1187 / 1707 loss=3.701, nll_loss=2.163, ppl=4.48, wps=367067, ups=0.75, wpb=490997, bsz=16630.5, num_updates=18200, lr=0.000468807, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=24790 epoch 011: 1187 / 1707 loss=3.701, nll_loss=2.163, ppl=4.48, wps=367067, ups=0.75, wpb=490997, bsz=16630.5, num_updates=18200, lr=0.000468807, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=24790 epoch 011: 1187 / 1707 loss=3.701, nll_loss=2.163, ppl=4.48, wps=367067, ups=0.75, wpb=490997, bsz=16630.5, num_updates=18200, lr=0.000468807, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=24790 epoch 011: 1187 / 1707 loss=3.701, nll_loss=2.163, ppl=4.48, wps=367067, ups=0.75, wpb=490997, bsz=16630.5, num_updates=18200, lr=0.000468807, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=24790 epoch 011: 1187 / 1707 loss=3.701, nll_loss=2.163, ppl=4.48, wps=367067, ups=0.75, wpb=490997, bsz=16630.5, num_updates=18200, lr=0.000468807, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=24790 epoch 011: 1187 / 1707 loss=3.701, nll_loss=2.163, ppl=4.48, wps=367067, ups=0.75, wpb=490997, bsz=16630.5, num_updates=18200, lr=0.000468807, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=24790 epoch 011: 1187 / 1707 loss=3.701, nll_loss=2.163, ppl=4.48, wps=367067, ups=0.75, wpb=490997, bsz=16630.5, num_updates=18200, lr=0.000468807, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=24790 epoch 011: 1187 / 1707 loss=3.701, nll_loss=2.163, ppl=4.48, wps=367067, ups=0.75, wpb=490997, bsz=16630.5, num_updates=18200, lr=0.000468807, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=24790 epoch 011: 1287 / 1707 loss=3.698, nll_loss=2.16, ppl=4.47, wps=367799, ups=0.75, wpb=490955, bsz=16517.9, num_updates=18300, lr=0.000467525, gnorm=0.264, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24924 epoch 011: 1287 / 1707 loss=3.698, nll_loss=2.16, ppl=4.47, wps=367799, ups=0.75, wpb=490955, bsz=16517.9, num_updates=18300, lr=0.000467525, gnorm=0.264, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24924 epoch 011: 1287 / 1707 loss=3.698, nll_loss=2.16, ppl=4.47, wps=367799, ups=0.75, wpb=490955, bsz=16517.9, num_updates=18300, lr=0.000467525, gnorm=0.264, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24924 epoch 011: 1287 / 1707 loss=3.698, nll_loss=2.16, ppl=4.47, wps=367799, ups=0.75, wpb=490955, bsz=16517.9, num_updates=18300, lr=0.000467525, gnorm=0.264, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24924 epoch 011: 1287 / 1707 loss=3.698, nll_loss=2.16, ppl=4.47, wps=367799, ups=0.75, wpb=490955, bsz=16517.9, num_updates=18300, lr=0.000467525, gnorm=0.264, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24924 epoch 011: 1287 / 1707 loss=3.698, nll_loss=2.16, ppl=4.47, wps=367799, ups=0.75, wpb=490955, bsz=16517.9, num_updates=18300, lr=0.000467525, gnorm=0.264, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24924 epoch 011: 1287 / 1707 loss=3.698, nll_loss=2.16, ppl=4.47, wps=367799, ups=0.75, wpb=490955, bsz=16517.9, num_updates=18300, lr=0.000467525, gnorm=0.264, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24924 epoch 011: 1287 / 1707 loss=3.698, nll_loss=2.16, ppl=4.47, wps=367799, ups=0.75, wpb=490955, bsz=16517.9, num_updates=18300, lr=0.000467525, gnorm=0.264, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24924 epoch 011: 1287 / 1707 loss=3.698, nll_loss=2.16, ppl=4.47, wps=367799, ups=0.75, wpb=490955, bsz=16517.9, num_updates=18300, lr=0.000467525, gnorm=0.264, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24924 epoch 011: 1287 / 1707 loss=3.698, nll_loss=2.16, ppl=4.47, wps=367799, ups=0.75, wpb=490955, bsz=16517.9, num_updates=18300, lr=0.000467525, gnorm=0.264, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24924 epoch 011: 1287 / 1707 loss=3.698, nll_loss=2.16, ppl=4.47, wps=367799, ups=0.75, wpb=490955, bsz=16517.9, num_updates=18300, lr=0.000467525, gnorm=0.264, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=24924 epoch 011: 1388 / 1707 loss=3.698, nll_loss=2.159, ppl=4.47, wps=361986, ups=0.74, wpb=489368, bsz=16150.4, num_updates=18400, lr=0.000466252, gnorm=0.272, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=25059 epoch 011: 1388 / 1707 loss=3.698, nll_loss=2.159, ppl=4.47, wps=361986, ups=0.74, wpb=489368, bsz=16150.4, num_updates=18400, lr=0.000466252, gnorm=0.272, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=25059 epoch 011: 1388 / 1707 loss=3.698, nll_loss=2.159, ppl=4.47, wps=361986, ups=0.74, wpb=489368, bsz=16150.4, num_updates=18400, lr=0.000466252, gnorm=0.272, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=25059 epoch 011: 1388 / 1707 loss=3.698, nll_loss=2.159, ppl=4.47, wps=361986, ups=0.74, wpb=489368, bsz=16150.4, num_updates=18400, lr=0.000466252, gnorm=0.272, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=25059 epoch 011: 1388 / 1707 loss=3.698, nll_loss=2.159, ppl=4.47, wps=361986, ups=0.74, wpb=489368, bsz=16150.4, num_updates=18400, lr=0.000466252, gnorm=0.272, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=25059 epoch 011: 1388 / 1707 loss=3.698, nll_loss=2.159, ppl=4.47, wps=361986, ups=0.74, wpb=489368, bsz=16150.4, num_updates=18400, lr=0.000466252, gnorm=0.272, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=25059 epoch 011: 1388 / 1707 loss=3.698, nll_loss=2.159, ppl=4.47, wps=361986, ups=0.74, wpb=489368, bsz=16150.4, num_updates=18400, lr=0.000466252, gnorm=0.272, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=25059 epoch 011: 1388 / 1707 loss=3.698, nll_loss=2.159, ppl=4.47, wps=361986, ups=0.74, wpb=489368, bsz=16150.4, num_updates=18400, lr=0.000466252, gnorm=0.272, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=25059 epoch 011: 1388 / 1707 loss=3.698, nll_loss=2.159, ppl=4.47, wps=361986, ups=0.74, wpb=489368, bsz=16150.4, num_updates=18400, lr=0.000466252, gnorm=0.272, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=25059 epoch 011: 1388 / 1707 loss=3.698, nll_loss=2.159, ppl=4.47, wps=361986, ups=0.74, wpb=489368, bsz=16150.4, num_updates=18400, lr=0.000466252, gnorm=0.272, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=25059 epoch 011: 1388 / 1707 loss=3.698, nll_loss=2.159, ppl=4.47, wps=361986, ups=0.74, wpb=489368, bsz=16150.4, num_updates=18400, lr=0.000466252, gnorm=0.272, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=25059 epoch 011: 1488 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366964, ups=0.75, wpb=490217, bsz=16369, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25193 epoch 011: 1488 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366964, ups=0.75, wpb=490217, bsz=16369, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25193 epoch 011: 1488 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366964, ups=0.75, wpb=490217, bsz=16369, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25193 epoch 011: 1488 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366964, ups=0.75, wpb=490217, bsz=16369, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25193 epoch 011: 1488 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366964, ups=0.75, wpb=490217, bsz=16369, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25193 epoch 011: 1488 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366964, ups=0.75, wpb=490217, bsz=16369, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25193 epoch 011: 1488 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366964, ups=0.75, wpb=490217, bsz=16369, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25193 epoch 011: 1488 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366964, ups=0.75, wpb=490217, bsz=16369, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25193 epoch 011: 1488 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366964, ups=0.75, wpb=490217, bsz=16369, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25193 epoch 011: 1488 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366964, ups=0.75, wpb=490217, bsz=16369, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25193 epoch 011: 1488 / 1707 loss=3.697, nll_loss=2.159, ppl=4.47, wps=366964, ups=0.75, wpb=490217, bsz=16369, num_updates=18500, lr=0.000464991, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25193 epoch 011: 1588 / 1707 loss=3.691, nll_loss=2.152, ppl=4.45, wps=366811, ups=0.75, wpb=490140, bsz=16107.7, num_updates=18600, lr=0.000463739, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=25326 epoch 011: 1588 / 1707 loss=3.691, nll_loss=2.152, ppl=4.45, wps=366811, ups=0.75, wpb=490140, bsz=16107.7, num_updates=18600, lr=0.000463739, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=25326 epoch 011: 1588 / 1707 loss=3.691, nll_loss=2.152, ppl=4.45, wps=366811, ups=0.75, wpb=490140, bsz=16107.7, num_updates=18600, lr=0.000463739, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=25326 epoch 011: 1588 / 1707 loss=3.691, nll_loss=2.152, ppl=4.45, wps=366811, ups=0.75, wpb=490140, bsz=16107.7, num_updates=18600, lr=0.000463739, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=25326 epoch 011: 1588 / 1707 loss=3.691, nll_loss=2.152, ppl=4.45, wps=366811, ups=0.75, wpb=490140, bsz=16107.7, num_updates=18600, lr=0.000463739, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=25326 epoch 011: 1588 / 1707 loss=3.691, nll_loss=2.152, ppl=4.45, wps=366811, ups=0.75, wpb=490140, bsz=16107.7, num_updates=18600, lr=0.000463739, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=25326 epoch 011: 1588 / 1707 loss=3.691, nll_loss=2.152, ppl=4.45, wps=366811, ups=0.75, wpb=490140, bsz=16107.7, num_updates=18600, lr=0.000463739, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=25326 epoch 011: 1588 / 1707 loss=3.691, nll_loss=2.152, ppl=4.45, wps=366811, ups=0.75, wpb=490140, bsz=16107.7, num_updates=18600, lr=0.000463739, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=25326 epoch 011: 1588 / 1707 loss=3.691, nll_loss=2.152, ppl=4.45, wps=366811, ups=0.75, wpb=490140, bsz=16107.7, num_updates=18600, lr=0.000463739, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=25326 epoch 011: 1588 / 1707 loss=3.691, nll_loss=2.152, ppl=4.45, wps=366811, ups=0.75, wpb=490140, bsz=16107.7, num_updates=18600, lr=0.000463739, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=25326 epoch 011: 1588 / 1707 loss=3.691, nll_loss=2.152, ppl=4.45, wps=366811, ups=0.75, wpb=490140, bsz=16107.7, num_updates=18600, lr=0.000463739, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=25326 epoch 011: 1689 / 1707 loss=3.702, nll_loss=2.165, ppl=4.48, wps=363680, ups=0.74, wpb=489519, bsz=16264.7, num_updates=18700, lr=0.000462497, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=25461 epoch 011: 1689 / 1707 loss=3.702, nll_loss=2.165, ppl=4.48, wps=363680, ups=0.74, wpb=489519, bsz=16264.7, num_updates=18700, lr=0.000462497, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=25461 epoch 011: 1689 / 1707 loss=3.702, nll_loss=2.165, ppl=4.48, wps=363680, ups=0.74, wpb=489519, bsz=16264.7, num_updates=18700, lr=0.000462497, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=25461 epoch 011: 1689 / 1707 loss=3.702, nll_loss=2.165, ppl=4.48, wps=363680, ups=0.74, wpb=489519, bsz=16264.7, num_updates=18700, lr=0.000462497, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=25461 epoch 011: 1689 / 1707 loss=3.702, nll_loss=2.165, ppl=4.48, wps=363680, ups=0.74, wpb=489519, bsz=16264.7, num_updates=18700, lr=0.000462497, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=25461 epoch 011: 1689 / 1707 loss=3.702, nll_loss=2.165, ppl=4.48, wps=363680, ups=0.74, wpb=489519, bsz=16264.7, num_updates=18700, lr=0.000462497, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=25461 epoch 011: 1689 / 1707 loss=3.702, nll_loss=2.165, ppl=4.48, wps=363680, ups=0.74, wpb=489519, bsz=16264.7, num_updates=18700, lr=0.000462497, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=25461 epoch 011: 1689 / 1707 loss=3.702, nll_loss=2.165, ppl=4.48, wps=363680, ups=0.74, wpb=489519, bsz=16264.7, num_updates=18700, lr=0.000462497, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=25461 epoch 011: 1689 / 1707 loss=3.702, nll_loss=2.165, ppl=4.48, wps=363680, ups=0.74, wpb=489519, bsz=16264.7, num_updates=18700, lr=0.000462497, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=25461 epoch 011: 1689 / 1707 loss=3.702, nll_loss=2.165, ppl=4.48, wps=363680, ups=0.74, wpb=489519, bsz=16264.7, num_updates=18700, lr=0.000462497, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=25461 epoch 011: 1689 / 1707 loss=3.702, nll_loss=2.165, ppl=4.48, wps=363680, ups=0.74, wpb=489519, bsz=16264.7, num_updates=18700, lr=0.000462497, gnorm=0.274, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=25461 end of epoch 11 (average epoch stats below) epoch 011 | loss 3.694 | nll_loss 2.155 | ppl 4.45 | wps 362773 | ups 0.74 | wpb 489885 | bsz 16331.8 | num_updates 18718 | lr 0.000462275 | gnorm 0.267 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.1 | wall 25484 epoch 011 | loss 3.694 | nll_loss 2.155 | ppl 4.45 | wps 362773 | ups 0.74 | wpb 489885 | bsz 16331.8 | num_updates 18718 | lr 0.000462275 | gnorm 0.267 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.1 | wall 25484 epoch 011 | loss 3.694 | nll_loss 2.155 | ppl 4.45 | wps 362773 | ups 0.74 | wpb 489885 | bsz 16331.8 | num_updates 18718 | lr 0.000462275 | gnorm 0.267 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.1 | wall 25484 epoch 011 | loss 3.694 | nll_loss 2.155 | ppl 4.45 | wps 362773 | ups 0.74 | wpb 489885 | bsz 16331.8 | num_updates 18718 | lr 0.000462275 | gnorm 0.267 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.1 | wall 25484 epoch 011 | loss 3.694 | nll_loss 2.155 | ppl 4.45 | wps 362773 | ups 0.74 | wpb 489885 | bsz 16331.8 | num_updates 18718 | lr 0.000462275 | gnorm 0.267 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.1 | wall 25484 epoch 011 | loss 3.694 | nll_loss 2.155 | ppl 4.45 | wps 362773 | ups 0.74 | wpb 489885 | bsz 16331.8 | num_updates 18718 | lr 0.000462275 | gnorm 0.267 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.1 | wall 25484 epoch 011 | loss 3.694 | nll_loss 2.155 | ppl 4.45 | wps 362773 | ups 0.74 | wpb 489885 | bsz 16331.8 | num_updates 18718 | lr 0.000462275 | gnorm 0.267 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.1 | wall 25484 epoch 011 | loss 3.694 | nll_loss 2.155 | ppl 4.45 | wps 362773 | ups 0.74 | wpb 489885 | bsz 16331.8 | num_updates 18718 | lr 0.000462275 | gnorm 0.267 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.1 | wall 25484 epoch 011 | loss 3.694 | nll_loss 2.155 | ppl 4.45 | wps 362773 | ups 0.74 | wpb 489885 | bsz 16331.8 | num_updates 18718 | lr 0.000462275 | gnorm 0.267 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.1 | wall 25484 epoch 011 | loss 3.694 | nll_loss 2.155 | ppl 4.45 | wps 362773 | ups 0.74 | wpb 489885 | bsz 16331.8 | num_updates 18718 | lr 0.000462275 | gnorm 0.267 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.1 | wall 25484 epoch 011 | loss 3.694 | nll_loss 2.155 | ppl 4.45 | wps 362773 | ups 0.74 | wpb 489885 | bsz 16331.8 | num_updates 18718 | lr 0.000462275 | gnorm 0.267 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.1 | wall 25484 Start iterating over samples epoch 012: 82 / 1707 loss=3.666, nll_loss=2.124, ppl=4.36, wps=362451, ups=0.75, wpb=485711, bsz=16062.8, num_updates=18800, lr=0.000461266, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=25595 epoch 012: 82 / 1707 loss=3.666, nll_loss=2.124, ppl=4.36, wps=362451, ups=0.75, wpb=485711, bsz=16062.8, num_updates=18800, lr=0.000461266, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=25595 epoch 012: 82 / 1707 loss=3.666, nll_loss=2.124, ppl=4.36, wps=362451, ups=0.75, wpb=485711, bsz=16062.8, num_updates=18800, lr=0.000461266, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=25595 epoch 012: 82 / 1707 loss=3.666, nll_loss=2.124, ppl=4.36, wps=362451, ups=0.75, wpb=485711, bsz=16062.8, num_updates=18800, lr=0.000461266, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=25595 epoch 012: 82 / 1707 loss=3.666, nll_loss=2.124, ppl=4.36, wps=362451, ups=0.75, wpb=485711, bsz=16062.8, num_updates=18800, lr=0.000461266, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=25595 epoch 012: 82 / 1707 loss=3.666, nll_loss=2.124, ppl=4.36, wps=362451, ups=0.75, wpb=485711, bsz=16062.8, num_updates=18800, lr=0.000461266, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=25595 epoch 012: 82 / 1707 loss=3.666, nll_loss=2.124, ppl=4.36, wps=362451, ups=0.75, wpb=485711, bsz=16062.8, num_updates=18800, lr=0.000461266, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=25595 epoch 012: 82 / 1707 loss=3.666, nll_loss=2.124, ppl=4.36, wps=362451, ups=0.75, wpb=485711, bsz=16062.8, num_updates=18800, lr=0.000461266, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=25595 epoch 012: 82 / 1707 loss=3.666, nll_loss=2.124, ppl=4.36, wps=362451, ups=0.75, wpb=485711, bsz=16062.8, num_updates=18800, lr=0.000461266, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=25595 epoch 012: 82 / 1707 loss=3.666, nll_loss=2.124, ppl=4.36, wps=362451, ups=0.75, wpb=485711, bsz=16062.8, num_updates=18800, lr=0.000461266, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=25595 epoch 012: 82 / 1707 loss=3.666, nll_loss=2.124, ppl=4.36, wps=362451, ups=0.75, wpb=485711, bsz=16062.8, num_updates=18800, lr=0.000461266, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=25595 epoch 012: 82 / 1707 loss=3.666, nll_loss=2.124, ppl=4.36, wps=362451, ups=0.75, wpb=485711, bsz=16062.8, num_updates=18800, lr=0.000461266, gnorm=0.267, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=25595 epoch 012: 182 / 1707 loss=3.664, nll_loss=2.121, ppl=4.35, wps=367679, ups=0.75, wpb=490148, bsz=16228.3, num_updates=18900, lr=0.000460044, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25728 epoch 012: 182 / 1707 loss=3.664, nll_loss=2.121, ppl=4.35, wps=367679, ups=0.75, wpb=490148, bsz=16228.3, num_updates=18900, lr=0.000460044, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25728 epoch 012: 182 / 1707 loss=3.664, nll_loss=2.121, ppl=4.35, wps=367679, ups=0.75, wpb=490148, bsz=16228.3, num_updates=18900, lr=0.000460044, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25728 epoch 012: 182 / 1707 loss=3.664, nll_loss=2.121, ppl=4.35, wps=367679, ups=0.75, wpb=490148, bsz=16228.3, num_updates=18900, lr=0.000460044, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25728 epoch 012: 182 / 1707 loss=3.664, nll_loss=2.121, ppl=4.35, wps=367679, ups=0.75, wpb=490148, bsz=16228.3, num_updates=18900, lr=0.000460044, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25728 epoch 012: 182 / 1707 loss=3.664, nll_loss=2.121, ppl=4.35, wps=367679, ups=0.75, wpb=490148, bsz=16228.3, num_updates=18900, lr=0.000460044, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25728 epoch 012: 182 / 1707 loss=3.664, nll_loss=2.121, ppl=4.35, wps=367679, ups=0.75, wpb=490148, bsz=16228.3, num_updates=18900, lr=0.000460044, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25728 epoch 012: 182 / 1707 loss=3.664, nll_loss=2.121, ppl=4.35, wps=367679, ups=0.75, wpb=490148, bsz=16228.3, num_updates=18900, lr=0.000460044, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25728 epoch 012: 182 / 1707 loss=3.664, nll_loss=2.121, ppl=4.35, wps=367679, ups=0.75, wpb=490148, bsz=16228.3, num_updates=18900, lr=0.000460044, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25728 epoch 012: 182 / 1707 loss=3.664, nll_loss=2.121, ppl=4.35, wps=367679, ups=0.75, wpb=490148, bsz=16228.3, num_updates=18900, lr=0.000460044, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25728 epoch 012: 182 / 1707 loss=3.664, nll_loss=2.121, ppl=4.35, wps=367679, ups=0.75, wpb=490148, bsz=16228.3, num_updates=18900, lr=0.000460044, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25728 epoch 012: 182 / 1707 loss=3.664, nll_loss=2.121, ppl=4.35, wps=367679, ups=0.75, wpb=490148, bsz=16228.3, num_updates=18900, lr=0.000460044, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=25728 epoch 012: 283 / 1707 loss=3.672, nll_loss=2.131, ppl=4.38, wps=363376, ups=0.74, wpb=490338, bsz=16582.3, num_updates=19000, lr=0.000458831, gnorm=0.267, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=25863 epoch 012: 283 / 1707 loss=3.672, nll_loss=2.131, ppl=4.38, wps=363376, ups=0.74, wpb=490338, bsz=16582.3, num_updates=19000, lr=0.000458831, gnorm=0.267, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=25863 epoch 012: 283 / 1707 loss=3.672, nll_loss=2.131, ppl=4.38, wps=363376, ups=0.74, wpb=490338, bsz=16582.3, num_updates=19000, lr=0.000458831, gnorm=0.267, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=25863 epoch 012: 283 / 1707 loss=3.672, nll_loss=2.131, ppl=4.38, wps=363376, ups=0.74, wpb=490338, bsz=16582.3, num_updates=19000, lr=0.000458831, gnorm=0.267, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=25863 epoch 012: 283 / 1707 loss=3.672, nll_loss=2.131, ppl=4.38, wps=363376, ups=0.74, wpb=490338, bsz=16582.3, num_updates=19000, lr=0.000458831, gnorm=0.267, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=25863 epoch 012: 283 / 1707 loss=3.672, nll_loss=2.131, ppl=4.38, wps=363376, ups=0.74, wpb=490338, bsz=16582.3, num_updates=19000, lr=0.000458831, gnorm=0.267, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=25863 epoch 012: 283 / 1707 loss=3.672, nll_loss=2.131, ppl=4.38, wps=363376, ups=0.74, wpb=490338, bsz=16582.3, num_updates=19000, lr=0.000458831, gnorm=0.267, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=25863 epoch 012: 283 / 1707 loss=3.672, nll_loss=2.131, ppl=4.38, wps=363376, ups=0.74, wpb=490338, bsz=16582.3, num_updates=19000, lr=0.000458831, gnorm=0.267, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=25863 epoch 012: 283 / 1707 loss=3.672, nll_loss=2.131, ppl=4.38, wps=363376, ups=0.74, wpb=490338, bsz=16582.3, num_updates=19000, lr=0.000458831, gnorm=0.267, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=25863 epoch 012: 283 / 1707 loss=3.672, nll_loss=2.131, ppl=4.38, wps=363376, ups=0.74, wpb=490338, bsz=16582.3, num_updates=19000, lr=0.000458831, gnorm=0.267, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=25863 epoch 012: 283 / 1707 loss=3.672, nll_loss=2.131, ppl=4.38, wps=363376, ups=0.74, wpb=490338, bsz=16582.3, num_updates=19000, lr=0.000458831, gnorm=0.267, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=25863 epoch 012: 283 / 1707 loss=3.672, nll_loss=2.131, ppl=4.38, wps=363376, ups=0.74, wpb=490338, bsz=16582.3, num_updates=19000, lr=0.000458831, gnorm=0.267, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=25863 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 3.83 | nll_loss 2.281 | ppl 4.86 | wps 218749 | wpb 22263 | bsz 1004 | num_updates 19000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.83 | nll_loss 2.281 | ppl 4.86 | wps 218749 | wpb 22263 | bsz 1004 | num_updates 19000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.83 | nll_loss 2.281 | ppl 4.86 | wps 218749 | wpb 22263 | bsz 1004 | num_updates 19000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.83 | nll_loss 2.281 | ppl 4.86 | wps 218749 | wpb 22263 | bsz 1004 | num_updates 19000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.83 | nll_loss 2.281 | ppl 4.86 | wps 218749 | wpb 22263 | bsz 1004 | num_updates 19000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.83 | nll_loss 2.281 | ppl 4.86 | wps 218749 | wpb 22263 | bsz 1004 | num_updates 19000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.83 | nll_loss 2.281 | ppl 4.86 | wps 218749 | wpb 22263 | bsz 1004 | num_updates 19000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.83 | nll_loss 2.281 | ppl 4.86 | wps 218749 | wpb 22263 | bsz 1004 | num_updates 19000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.83 | nll_loss 2.281 | ppl 4.86 | wps 218749 | wpb 22263 | bsz 1004 | num_updates 19000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.83 | nll_loss 2.281 | ppl 4.86 | wps 218749 | wpb 22263 | bsz 1004 | num_updates 19000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.83 | nll_loss 2.281 | ppl 4.86 | wps 218749 | wpb 22263 | bsz 1004 | num_updates 19000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.83 | nll_loss 2.281 | ppl 4.86 | wps 218749 | wpb 22263 | bsz 1004 | num_updates 19000 | best_loss 3.819 epoch 012: 383 / 1707 loss=3.676, nll_loss=2.135, ppl=4.39, wps=337212, ups=0.69, wpb=489343, bsz=16223.4, num_updates=19100, lr=0.000457629, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26008 epoch 012: 383 / 1707 loss=3.676, nll_loss=2.135, ppl=4.39, wps=337212, ups=0.69, wpb=489343, bsz=16223.4, num_updates=19100, lr=0.000457629, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26008 epoch 012: 383 / 1707 loss=3.676, nll_loss=2.135, ppl=4.39, wps=337212, ups=0.69, wpb=489343, bsz=16223.4, num_updates=19100, lr=0.000457629, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26008 epoch 012: 383 / 1707 loss=3.676, nll_loss=2.135, ppl=4.39, wps=337212, ups=0.69, wpb=489343, bsz=16223.4, num_updates=19100, lr=0.000457629, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26008 epoch 012: 383 / 1707 loss=3.676, nll_loss=2.135, ppl=4.39, wps=337212, ups=0.69, wpb=489343, bsz=16223.4, num_updates=19100, lr=0.000457629, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26008 epoch 012: 383 / 1707 loss=3.676, nll_loss=2.135, ppl=4.39, wps=337212, ups=0.69, wpb=489343, bsz=16223.4, num_updates=19100, lr=0.000457629, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26008 epoch 012: 383 / 1707 loss=3.676, nll_loss=2.135, ppl=4.39, wps=337212, ups=0.69, wpb=489343, bsz=16223.4, num_updates=19100, lr=0.000457629, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26008 epoch 012: 383 / 1707 loss=3.676, nll_loss=2.135, ppl=4.39, wps=337212, ups=0.69, wpb=489343, bsz=16223.4, num_updates=19100, lr=0.000457629, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26008 epoch 012: 383 / 1707 loss=3.676, nll_loss=2.135, ppl=4.39, wps=337212, ups=0.69, wpb=489343, bsz=16223.4, num_updates=19100, lr=0.000457629, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26008 epoch 012: 383 / 1707 loss=3.676, nll_loss=2.135, ppl=4.39, wps=337212, ups=0.69, wpb=489343, bsz=16223.4, num_updates=19100, lr=0.000457629, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26008 epoch 012: 383 / 1707 loss=3.676, nll_loss=2.135, ppl=4.39, wps=337212, ups=0.69, wpb=489343, bsz=16223.4, num_updates=19100, lr=0.000457629, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26008 epoch 012: 383 / 1707 loss=3.676, nll_loss=2.135, ppl=4.39, wps=337212, ups=0.69, wpb=489343, bsz=16223.4, num_updates=19100, lr=0.000457629, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26008 epoch 012: 483 / 1707 loss=3.669, nll_loss=2.128, ppl=4.37, wps=368206, ups=0.75, wpb=490719, bsz=16203, num_updates=19200, lr=0.000456435, gnorm=0.263, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=26142 epoch 012: 483 / 1707 loss=3.669, nll_loss=2.128, ppl=4.37, wps=368206, ups=0.75, wpb=490719, bsz=16203, num_updates=19200, lr=0.000456435, gnorm=0.263, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=26142 epoch 012: 483 / 1707 loss=3.669, nll_loss=2.128, ppl=4.37, wps=368206, ups=0.75, wpb=490719, bsz=16203, num_updates=19200, lr=0.000456435, gnorm=0.263, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=26142 epoch 012: 483 / 1707 loss=3.669, nll_loss=2.128, ppl=4.37, wps=368206, ups=0.75, wpb=490719, bsz=16203, num_updates=19200, lr=0.000456435, gnorm=0.263, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=26142 epoch 012: 483 / 1707 loss=3.669, nll_loss=2.128, ppl=4.37, wps=368206, ups=0.75, wpb=490719, bsz=16203, num_updates=19200, lr=0.000456435, gnorm=0.263, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=26142 epoch 012: 483 / 1707 loss=3.669, nll_loss=2.128, ppl=4.37, wps=368206, ups=0.75, wpb=490719, bsz=16203, num_updates=19200, lr=0.000456435, gnorm=0.263, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=26142 epoch 012: 483 / 1707 loss=3.669, nll_loss=2.128, ppl=4.37, wps=368206, ups=0.75, wpb=490719, bsz=16203, num_updates=19200, lr=0.000456435, gnorm=0.263, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=26142 epoch 012: 483 / 1707 loss=3.669, nll_loss=2.128, ppl=4.37, wps=368206, ups=0.75, wpb=490719, bsz=16203, num_updates=19200, lr=0.000456435, gnorm=0.263, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=26142 epoch 012: 483 / 1707 loss=3.669, nll_loss=2.128, ppl=4.37, wps=368206, ups=0.75, wpb=490719, bsz=16203, num_updates=19200, lr=0.000456435, gnorm=0.263, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=26142 epoch 012: 483 / 1707 loss=3.669, nll_loss=2.128, ppl=4.37, wps=368206, ups=0.75, wpb=490719, bsz=16203, num_updates=19200, lr=0.000456435, gnorm=0.263, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=26142 epoch 012: 483 / 1707 loss=3.669, nll_loss=2.128, ppl=4.37, wps=368206, ups=0.75, wpb=490719, bsz=16203, num_updates=19200, lr=0.000456435, gnorm=0.263, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=26142 epoch 012: 483 / 1707 loss=3.669, nll_loss=2.128, ppl=4.37, wps=368206, ups=0.75, wpb=490719, bsz=16203, num_updates=19200, lr=0.000456435, gnorm=0.263, clip=0, loss_scale=8, train_wall=133, gb_free=60.5, wall=26142 epoch 012: 584 / 1707 loss=3.68, nll_loss=2.14, ppl=4.41, wps=362894, ups=0.74, wpb=488961, bsz=16370.7, num_updates=19300, lr=0.000455251, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=26276 epoch 012: 584 / 1707 loss=3.68, nll_loss=2.14, ppl=4.41, wps=362894, ups=0.74, wpb=488961, bsz=16370.7, num_updates=19300, lr=0.000455251, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=26276 epoch 012: 584 / 1707 loss=3.68, nll_loss=2.14, ppl=4.41, wps=362894, ups=0.74, wpb=488961, bsz=16370.7, num_updates=19300, lr=0.000455251, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=26276 epoch 012: 584 / 1707 loss=3.68, nll_loss=2.14, ppl=4.41, wps=362894, ups=0.74, wpb=488961, bsz=16370.7, num_updates=19300, lr=0.000455251, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=26276 epoch 012: 584 / 1707 loss=3.68, nll_loss=2.14, ppl=4.41, wps=362894, ups=0.74, wpb=488961, bsz=16370.7, num_updates=19300, lr=0.000455251, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=26276 epoch 012: 584 / 1707 loss=3.68, nll_loss=2.14, ppl=4.41, wps=362894, ups=0.74, wpb=488961, bsz=16370.7, num_updates=19300, lr=0.000455251, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=26276 epoch 012: 584 / 1707 loss=3.68, nll_loss=2.14, ppl=4.41, wps=362894, ups=0.74, wpb=488961, bsz=16370.7, num_updates=19300, lr=0.000455251, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=26276 epoch 012: 584 / 1707 loss=3.68, nll_loss=2.14, ppl=4.41, wps=362894, ups=0.74, wpb=488961, bsz=16370.7, num_updates=19300, lr=0.000455251, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=26276 epoch 012: 584 / 1707 loss=3.68, nll_loss=2.14, ppl=4.41, wps=362894, ups=0.74, wpb=488961, bsz=16370.7, num_updates=19300, lr=0.000455251, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=26276 epoch 012: 584 / 1707 loss=3.68, nll_loss=2.14, ppl=4.41, wps=362894, ups=0.74, wpb=488961, bsz=16370.7, num_updates=19300, lr=0.000455251, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=26276 epoch 012: 584 / 1707 loss=3.68, nll_loss=2.14, ppl=4.41, wps=362894, ups=0.74, wpb=488961, bsz=16370.7, num_updates=19300, lr=0.000455251, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=26276 epoch 012: 584 / 1707 loss=3.68, nll_loss=2.14, ppl=4.41, wps=362894, ups=0.74, wpb=488961, bsz=16370.7, num_updates=19300, lr=0.000455251, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=26276 epoch 012: 684 / 1707 loss=3.678, nll_loss=2.138, ppl=4.4, wps=365535, ups=0.75, wpb=489717, bsz=16264.2, num_updates=19400, lr=0.000454077, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=26410 epoch 012: 684 / 1707 loss=3.678, nll_loss=2.138, ppl=4.4, wps=365535, ups=0.75, wpb=489717, bsz=16264.2, num_updates=19400, lr=0.000454077, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=26410 epoch 012: 684 / 1707 loss=3.678, nll_loss=2.138, ppl=4.4, wps=365535, ups=0.75, wpb=489717, bsz=16264.2, num_updates=19400, lr=0.000454077, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=26410 epoch 012: 684 / 1707 loss=3.678, nll_loss=2.138, ppl=4.4, wps=365535, ups=0.75, wpb=489717, bsz=16264.2, num_updates=19400, lr=0.000454077, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=26410 epoch 012: 684 / 1707 loss=3.678, nll_loss=2.138, ppl=4.4, wps=365535, ups=0.75, wpb=489717, bsz=16264.2, num_updates=19400, lr=0.000454077, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=26410 epoch 012: 684 / 1707 loss=3.678, nll_loss=2.138, ppl=4.4, wps=365535, ups=0.75, wpb=489717, bsz=16264.2, num_updates=19400, lr=0.000454077, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=26410 epoch 012: 684 / 1707 loss=3.678, nll_loss=2.138, ppl=4.4, wps=365535, ups=0.75, wpb=489717, bsz=16264.2, num_updates=19400, lr=0.000454077, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=26410 epoch 012: 684 / 1707 loss=3.678, nll_loss=2.138, ppl=4.4, wps=365535, ups=0.75, wpb=489717, bsz=16264.2, num_updates=19400, lr=0.000454077, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=26410 epoch 012: 684 / 1707 loss=3.678, nll_loss=2.138, ppl=4.4, wps=365535, ups=0.75, wpb=489717, bsz=16264.2, num_updates=19400, lr=0.000454077, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=26410 epoch 012: 684 / 1707 loss=3.678, nll_loss=2.138, ppl=4.4, wps=365535, ups=0.75, wpb=489717, bsz=16264.2, num_updates=19400, lr=0.000454077, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=26410 epoch 012: 684 / 1707 loss=3.678, nll_loss=2.138, ppl=4.4, wps=365535, ups=0.75, wpb=489717, bsz=16264.2, num_updates=19400, lr=0.000454077, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=26410 epoch 012: 684 / 1707 loss=3.678, nll_loss=2.138, ppl=4.4, wps=365535, ups=0.75, wpb=489717, bsz=16264.2, num_updates=19400, lr=0.000454077, gnorm=0.265, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=26410 epoch 012: 785 / 1707 loss=3.671, nll_loss=2.13, ppl=4.38, wps=362314, ups=0.74, wpb=490643, bsz=16109.2, num_updates=19500, lr=0.000452911, gnorm=0.273, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=26546 epoch 012: 785 / 1707 loss=3.671, nll_loss=2.13, ppl=4.38, wps=362314, ups=0.74, wpb=490643, bsz=16109.2, num_updates=19500, lr=0.000452911, gnorm=0.273, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=26546 epoch 012: 785 / 1707 loss=3.671, nll_loss=2.13, ppl=4.38, wps=362314, ups=0.74, wpb=490643, bsz=16109.2, num_updates=19500, lr=0.000452911, gnorm=0.273, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=26546 epoch 012: 785 / 1707 loss=3.671, nll_loss=2.13, ppl=4.38, wps=362314, ups=0.74, wpb=490643, bsz=16109.2, num_updates=19500, lr=0.000452911, gnorm=0.273, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=26546 epoch 012: 785 / 1707 loss=3.671, nll_loss=2.13, ppl=4.38, wps=362314, ups=0.74, wpb=490643, bsz=16109.2, num_updates=19500, lr=0.000452911, gnorm=0.273, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=26546 epoch 012: 785 / 1707 loss=3.671, nll_loss=2.13, ppl=4.38, wps=362314, ups=0.74, wpb=490643, bsz=16109.2, num_updates=19500, lr=0.000452911, gnorm=0.273, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=26546 epoch 012: 785 / 1707 loss=3.671, nll_loss=2.13, ppl=4.38, wps=362314, ups=0.74, wpb=490643, bsz=16109.2, num_updates=19500, lr=0.000452911, gnorm=0.273, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=26546 epoch 012: 785 / 1707 loss=3.671, nll_loss=2.13, ppl=4.38, wps=362314, ups=0.74, wpb=490643, bsz=16109.2, num_updates=19500, lr=0.000452911, gnorm=0.273, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=26546 epoch 012: 785 / 1707 loss=3.671, nll_loss=2.13, ppl=4.38, wps=362314, ups=0.74, wpb=490643, bsz=16109.2, num_updates=19500, lr=0.000452911, gnorm=0.273, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=26546 epoch 012: 785 / 1707 loss=3.671, nll_loss=2.13, ppl=4.38, wps=362314, ups=0.74, wpb=490643, bsz=16109.2, num_updates=19500, lr=0.000452911, gnorm=0.273, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=26546 epoch 012: 785 / 1707 loss=3.671, nll_loss=2.13, ppl=4.38, wps=362314, ups=0.74, wpb=490643, bsz=16109.2, num_updates=19500, lr=0.000452911, gnorm=0.273, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=26546 epoch 012: 785 / 1707 loss=3.671, nll_loss=2.13, ppl=4.38, wps=362314, ups=0.74, wpb=490643, bsz=16109.2, num_updates=19500, lr=0.000452911, gnorm=0.273, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=26546 epoch 012: 885 / 1707 loss=3.675, nll_loss=2.134, ppl=4.39, wps=367299, ups=0.75, wpb=491424, bsz=16479.9, num_updates=19600, lr=0.000451754, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=26680 epoch 012: 885 / 1707 loss=3.675, nll_loss=2.134, ppl=4.39, wps=367299, ups=0.75, wpb=491424, bsz=16479.9, num_updates=19600, lr=0.000451754, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=26680 epoch 012: 885 / 1707 loss=3.675, nll_loss=2.134, ppl=4.39, wps=367299, ups=0.75, wpb=491424, bsz=16479.9, num_updates=19600, lr=0.000451754, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=26680 epoch 012: 885 / 1707 loss=3.675, nll_loss=2.134, ppl=4.39, wps=367299, ups=0.75, wpb=491424, bsz=16479.9, num_updates=19600, lr=0.000451754, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=26680 epoch 012: 885 / 1707 loss=3.675, nll_loss=2.134, ppl=4.39, wps=367299, ups=0.75, wpb=491424, bsz=16479.9, num_updates=19600, lr=0.000451754, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=26680 epoch 012: 885 / 1707 loss=3.675, nll_loss=2.134, ppl=4.39, wps=367299, ups=0.75, wpb=491424, bsz=16479.9, num_updates=19600, lr=0.000451754, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=26680 epoch 012: 885 / 1707 loss=3.675, nll_loss=2.134, ppl=4.39, wps=367299, ups=0.75, wpb=491424, bsz=16479.9, num_updates=19600, lr=0.000451754, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=26680 epoch 012: 885 / 1707 loss=3.675, nll_loss=2.134, ppl=4.39, wps=367299, ups=0.75, wpb=491424, bsz=16479.9, num_updates=19600, lr=0.000451754, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=26680 epoch 012: 885 / 1707 loss=3.675, nll_loss=2.134, ppl=4.39, wps=367299, ups=0.75, wpb=491424, bsz=16479.9, num_updates=19600, lr=0.000451754, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=26680 epoch 012: 885 / 1707 loss=3.675, nll_loss=2.134, ppl=4.39, wps=367299, ups=0.75, wpb=491424, bsz=16479.9, num_updates=19600, lr=0.000451754, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=26680 epoch 012: 885 / 1707 loss=3.675, nll_loss=2.134, ppl=4.39, wps=367299, ups=0.75, wpb=491424, bsz=16479.9, num_updates=19600, lr=0.000451754, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=26680 epoch 012: 885 / 1707 loss=3.675, nll_loss=2.134, ppl=4.39, wps=367299, ups=0.75, wpb=491424, bsz=16479.9, num_updates=19600, lr=0.000451754, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=26680 epoch 012: 985 / 1707 loss=3.676, nll_loss=2.136, ppl=4.4, wps=368176, ups=0.75, wpb=489772, bsz=16359.4, num_updates=19700, lr=0.000450606, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26813 epoch 012: 985 / 1707 loss=3.676, nll_loss=2.136, ppl=4.4, wps=368176, ups=0.75, wpb=489772, bsz=16359.4, num_updates=19700, lr=0.000450606, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26813 epoch 012: 985 / 1707 loss=3.676, nll_loss=2.136, ppl=4.4, wps=368176, ups=0.75, wpb=489772, bsz=16359.4, num_updates=19700, lr=0.000450606, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26813 epoch 012: 985 / 1707 loss=3.676, nll_loss=2.136, ppl=4.4, wps=368176, ups=0.75, wpb=489772, bsz=16359.4, num_updates=19700, lr=0.000450606, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26813 epoch 012: 985 / 1707 loss=3.676, nll_loss=2.136, ppl=4.4, wps=368176, ups=0.75, wpb=489772, bsz=16359.4, num_updates=19700, lr=0.000450606, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26813 epoch 012: 985 / 1707 loss=3.676, nll_loss=2.136, ppl=4.4, wps=368176, ups=0.75, wpb=489772, bsz=16359.4, num_updates=19700, lr=0.000450606, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26813 epoch 012: 985 / 1707 loss=3.676, nll_loss=2.136, ppl=4.4, wps=368176, ups=0.75, wpb=489772, bsz=16359.4, num_updates=19700, lr=0.000450606, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26813 epoch 012: 985 / 1707 loss=3.676, nll_loss=2.136, ppl=4.4, wps=368176, ups=0.75, wpb=489772, bsz=16359.4, num_updates=19700, lr=0.000450606, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26813 epoch 012: 985 / 1707 loss=3.676, nll_loss=2.136, ppl=4.4, wps=368176, ups=0.75, wpb=489772, bsz=16359.4, num_updates=19700, lr=0.000450606, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26813 epoch 012: 985 / 1707 loss=3.676, nll_loss=2.136, ppl=4.4, wps=368176, ups=0.75, wpb=489772, bsz=16359.4, num_updates=19700, lr=0.000450606, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26813 epoch 012: 985 / 1707 loss=3.676, nll_loss=2.136, ppl=4.4, wps=368176, ups=0.75, wpb=489772, bsz=16359.4, num_updates=19700, lr=0.000450606, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26813 epoch 012: 985 / 1707 loss=3.676, nll_loss=2.136, ppl=4.4, wps=368176, ups=0.75, wpb=489772, bsz=16359.4, num_updates=19700, lr=0.000450606, gnorm=0.262, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=26813 epoch 012: 1085 / 1707 loss=3.677, nll_loss=2.137, ppl=4.4, wps=368223, ups=0.75, wpb=490803, bsz=16401.8, num_updates=19800, lr=0.000449467, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=26946 epoch 012: 1085 / 1707 loss=3.677, nll_loss=2.137, ppl=4.4, wps=368223, ups=0.75, wpb=490803, bsz=16401.8, num_updates=19800, lr=0.000449467, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=26946 epoch 012: 1085 / 1707 loss=3.677, nll_loss=2.137, ppl=4.4, wps=368223, ups=0.75, wpb=490803, bsz=16401.8, num_updates=19800, lr=0.000449467, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=26946 epoch 012: 1085 / 1707 loss=3.677, nll_loss=2.137, ppl=4.4, wps=368223, ups=0.75, wpb=490803, bsz=16401.8, num_updates=19800, lr=0.000449467, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=26946 epoch 012: 1085 / 1707 loss=3.677, nll_loss=2.137, ppl=4.4, wps=368223, ups=0.75, wpb=490803, bsz=16401.8, num_updates=19800, lr=0.000449467, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=26946 epoch 012: 1085 / 1707 loss=3.677, nll_loss=2.137, ppl=4.4, wps=368223, ups=0.75, wpb=490803, bsz=16401.8, num_updates=19800, lr=0.000449467, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=26946 epoch 012: 1085 / 1707 loss=3.677, nll_loss=2.137, ppl=4.4, wps=368223, ups=0.75, wpb=490803, bsz=16401.8, num_updates=19800, lr=0.000449467, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=26946 epoch 012: 1085 / 1707 loss=3.677, nll_loss=2.137, ppl=4.4, wps=368223, ups=0.75, wpb=490803, bsz=16401.8, num_updates=19800, lr=0.000449467, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=26946 epoch 012: 1085 / 1707 loss=3.677, nll_loss=2.137, ppl=4.4, wps=368223, ups=0.75, wpb=490803, bsz=16401.8, num_updates=19800, lr=0.000449467, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=26946 epoch 012: 1085 / 1707 loss=3.677, nll_loss=2.137, ppl=4.4, wps=368223, ups=0.75, wpb=490803, bsz=16401.8, num_updates=19800, lr=0.000449467, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=26946 epoch 012: 1085 / 1707 loss=3.677, nll_loss=2.137, ppl=4.4, wps=368223, ups=0.75, wpb=490803, bsz=16401.8, num_updates=19800, lr=0.000449467, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=26946 epoch 012: 1085 / 1707 loss=3.677, nll_loss=2.137, ppl=4.4, wps=368223, ups=0.75, wpb=490803, bsz=16401.8, num_updates=19800, lr=0.000449467, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=26946 epoch 012: 1185 / 1707 loss=3.686, nll_loss=2.147, ppl=4.43, wps=366284, ups=0.75, wpb=489144, bsz=16265.2, num_updates=19900, lr=0.000448336, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=27079 epoch 012: 1185 / 1707 loss=3.686, nll_loss=2.147, ppl=4.43, wps=366284, ups=0.75, wpb=489144, bsz=16265.2, num_updates=19900, lr=0.000448336, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=27079 epoch 012: 1185 / 1707 loss=3.686, nll_loss=2.147, ppl=4.43, wps=366284, ups=0.75, wpb=489144, bsz=16265.2, num_updates=19900, lr=0.000448336, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=27079 epoch 012: 1185 / 1707 loss=3.686, nll_loss=2.147, ppl=4.43, wps=366284, ups=0.75, wpb=489144, bsz=16265.2, num_updates=19900, lr=0.000448336, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=27079 epoch 012: 1185 / 1707 loss=3.686, nll_loss=2.147, ppl=4.43, wps=366284, ups=0.75, wpb=489144, bsz=16265.2, num_updates=19900, lr=0.000448336, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=27079 epoch 012: 1185 / 1707 loss=3.686, nll_loss=2.147, ppl=4.43, wps=366284, ups=0.75, wpb=489144, bsz=16265.2, num_updates=19900, lr=0.000448336, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=27079 epoch 012: 1185 / 1707 loss=3.686, nll_loss=2.147, ppl=4.43, wps=366284, ups=0.75, wpb=489144, bsz=16265.2, num_updates=19900, lr=0.000448336, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=27079 epoch 012: 1185 / 1707 loss=3.686, nll_loss=2.147, ppl=4.43, wps=366284, ups=0.75, wpb=489144, bsz=16265.2, num_updates=19900, lr=0.000448336, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=27079 epoch 012: 1185 / 1707 loss=3.686, nll_loss=2.147, ppl=4.43, wps=366284, ups=0.75, wpb=489144, bsz=16265.2, num_updates=19900, lr=0.000448336, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=27079 epoch 012: 1185 / 1707 loss=3.686, nll_loss=2.147, ppl=4.43, wps=366284, ups=0.75, wpb=489144, bsz=16265.2, num_updates=19900, lr=0.000448336, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=27079 epoch 012: 1185 / 1707 loss=3.686, nll_loss=2.147, ppl=4.43, wps=366284, ups=0.75, wpb=489144, bsz=16265.2, num_updates=19900, lr=0.000448336, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=27079 epoch 012: 1185 / 1707 loss=3.686, nll_loss=2.147, ppl=4.43, wps=366284, ups=0.75, wpb=489144, bsz=16265.2, num_updates=19900, lr=0.000448336, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=27079 epoch 012: 1286 / 1707 loss=3.677, nll_loss=2.138, ppl=4.4, wps=360711, ups=0.74, wpb=489328, bsz=16509.5, num_updates=20000, lr=0.000447214, gnorm=0.256, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=27215 epoch 012: 1286 / 1707 loss=3.677, nll_loss=2.138, ppl=4.4, wps=360711, ups=0.74, wpb=489328, bsz=16509.5, num_updates=20000, lr=0.000447214, gnorm=0.256, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=27215 epoch 012: 1286 / 1707 loss=3.677, nll_loss=2.138, ppl=4.4, wps=360711, ups=0.74, wpb=489328, bsz=16509.5, num_updates=20000, lr=0.000447214, gnorm=0.256, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=27215 epoch 012: 1286 / 1707 loss=3.677, nll_loss=2.138, ppl=4.4, wps=360711, ups=0.74, wpb=489328, bsz=16509.5, num_updates=20000, lr=0.000447214, gnorm=0.256, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=27215 epoch 012: 1286 / 1707 loss=3.677, nll_loss=2.138, ppl=4.4, wps=360711, ups=0.74, wpb=489328, bsz=16509.5, num_updates=20000, lr=0.000447214, gnorm=0.256, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=27215 epoch 012: 1286 / 1707 loss=3.677, nll_loss=2.138, ppl=4.4, wps=360711, ups=0.74, wpb=489328, bsz=16509.5, num_updates=20000, lr=0.000447214, gnorm=0.256, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=27215 epoch 012: 1286 / 1707 loss=3.677, nll_loss=2.138, ppl=4.4, wps=360711, ups=0.74, wpb=489328, bsz=16509.5, num_updates=20000, lr=0.000447214, gnorm=0.256, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=27215 epoch 012: 1286 / 1707 loss=3.677, nll_loss=2.138, ppl=4.4, wps=360711, ups=0.74, wpb=489328, bsz=16509.5, num_updates=20000, lr=0.000447214, gnorm=0.256, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=27215 epoch 012: 1286 / 1707 loss=3.677, nll_loss=2.138, ppl=4.4, wps=360711, ups=0.74, wpb=489328, bsz=16509.5, num_updates=20000, lr=0.000447214, gnorm=0.256, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=27215 epoch 012: 1286 / 1707 loss=3.677, nll_loss=2.138, ppl=4.4, wps=360711, ups=0.74, wpb=489328, bsz=16509.5, num_updates=20000, lr=0.000447214, gnorm=0.256, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=27215 epoch 012: 1286 / 1707 loss=3.677, nll_loss=2.138, ppl=4.4, wps=360711, ups=0.74, wpb=489328, bsz=16509.5, num_updates=20000, lr=0.000447214, gnorm=0.256, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=27215 epoch 012: 1286 / 1707 loss=3.677, nll_loss=2.138, ppl=4.4, wps=360711, ups=0.74, wpb=489328, bsz=16509.5, num_updates=20000, lr=0.000447214, gnorm=0.256, clip=0, loss_scale=4, train_wall=135, gb_free=60.5, wall=27215 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 3.826 | nll_loss 2.282 | ppl 4.86 | wps 218568 | wpb 22263 | bsz 1004 | num_updates 20000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.826 | nll_loss 2.282 | ppl 4.86 | wps 218568 | wpb 22263 | bsz 1004 | num_updates 20000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.826 | nll_loss 2.282 | ppl 4.86 | wps 218568 | wpb 22263 | bsz 1004 | num_updates 20000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.826 | nll_loss 2.282 | ppl 4.86 | wps 218568 | wpb 22263 | bsz 1004 | num_updates 20000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.826 | nll_loss 2.282 | ppl 4.86 | wps 218568 | wpb 22263 | bsz 1004 | num_updates 20000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.826 | nll_loss 2.282 | ppl 4.86 | wps 218568 | wpb 22263 | bsz 1004 | num_updates 20000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.826 | nll_loss 2.282 | ppl 4.86 | wps 218568 | wpb 22263 | bsz 1004 | num_updates 20000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.826 | nll_loss 2.282 | ppl 4.86 | wps 218568 | wpb 22263 | bsz 1004 | num_updates 20000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.826 | nll_loss 2.282 | ppl 4.86 | wps 218568 | wpb 22263 | bsz 1004 | num_updates 20000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.826 | nll_loss 2.282 | ppl 4.86 | wps 218568 | wpb 22263 | bsz 1004 | num_updates 20000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.826 | nll_loss 2.282 | ppl 4.86 | wps 218568 | wpb 22263 | bsz 1004 | num_updates 20000 | best_loss 3.819 epoch 012 | valid on 'valid' subset | loss 3.826 | nll_loss 2.282 | ppl 4.86 | wps 218568 | wpb 22263 | bsz 1004 | num_updates 20000 | best_loss 3.819 epoch 012: 1387 / 1707 loss=3.683, nll_loss=2.143, ppl=4.42, wps=332876, ups=0.68, wpb=490020, bsz=16370.2, num_updates=20100, lr=0.0004461, gnorm=0.262, clip=0, loss_scale=2, train_wall=134, gb_free=61.1, wall=27362 epoch 012: 1387 / 1707 loss=3.683, nll_loss=2.143, ppl=4.42, wps=332876, ups=0.68, wpb=490020, bsz=16370.2, num_updates=20100, lr=0.0004461, gnorm=0.262, clip=0, loss_scale=2, train_wall=134, gb_free=61.1, wall=27362 epoch 012: 1387 / 1707 loss=3.683, nll_loss=2.143, ppl=4.42, wps=332876, ups=0.68, wpb=490020, bsz=16370.2, num_updates=20100, lr=0.0004461, gnorm=0.262, clip=0, loss_scale=2, train_wall=134, gb_free=61.1, wall=27362 epoch 012: 1387 / 1707 loss=3.683, nll_loss=2.143, ppl=4.42, wps=332876, ups=0.68, wpb=490020, bsz=16370.2, num_updates=20100, lr=0.0004461, gnorm=0.262, clip=0, loss_scale=2, train_wall=134, gb_free=61.1, wall=27362 epoch 012: 1387 / 1707 loss=3.683, nll_loss=2.143, ppl=4.42, wps=332876, ups=0.68, wpb=490020, bsz=16370.2, num_updates=20100, lr=0.0004461, gnorm=0.262, clip=0, loss_scale=2, train_wall=134, gb_free=61.1, wall=27362 epoch 012: 1387 / 1707 loss=3.683, nll_loss=2.143, ppl=4.42, wps=332876, ups=0.68, wpb=490020, bsz=16370.2, num_updates=20100, lr=0.0004461, gnorm=0.262, clip=0, loss_scale=2, train_wall=134, gb_free=61.1, wall=27362 epoch 012: 1387 / 1707 loss=3.683, nll_loss=2.143, ppl=4.42, wps=332876, ups=0.68, wpb=490020, bsz=16370.2, num_updates=20100, lr=0.0004461, gnorm=0.262, clip=0, loss_scale=2, train_wall=134, gb_free=61.1, wall=27362 epoch 012: 1387 / 1707 loss=3.683, nll_loss=2.143, ppl=4.42, wps=332876, ups=0.68, wpb=490020, bsz=16370.2, num_updates=20100, lr=0.0004461, gnorm=0.262, clip=0, loss_scale=2, train_wall=134, gb_free=61.1, wall=27362 epoch 012: 1387 / 1707 loss=3.683, nll_loss=2.143, ppl=4.42, wps=332876, ups=0.68, wpb=490020, bsz=16370.2, num_updates=20100, lr=0.0004461, gnorm=0.262, clip=0, loss_scale=2, train_wall=134, gb_free=61.1, wall=27362 epoch 012: 1387 / 1707 loss=3.683, nll_loss=2.143, ppl=4.42, wps=332876, ups=0.68, wpb=490020, bsz=16370.2, num_updates=20100, lr=0.0004461, gnorm=0.262, clip=0, loss_scale=2, train_wall=134, gb_free=61.1, wall=27362 epoch 012: 1387 / 1707 loss=3.683, nll_loss=2.143, ppl=4.42, wps=332876, ups=0.68, wpb=490020, bsz=16370.2, num_updates=20100, lr=0.0004461, gnorm=0.262, clip=0, loss_scale=2, train_wall=134, gb_free=61.1, wall=27362 epoch 012: 1387 / 1707 loss=3.683, nll_loss=2.143, ppl=4.42, wps=332876, ups=0.68, wpb=490020, bsz=16370.2, num_updates=20100, lr=0.0004461, gnorm=0.262, clip=0, loss_scale=2, train_wall=134, gb_free=61.1, wall=27362 epoch 012: 1487 / 1707 loss=3.681, nll_loss=2.142, ppl=4.41, wps=367531, ups=0.75, wpb=491042, bsz=16437.6, num_updates=20200, lr=0.000444994, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=27496 epoch 012: 1487 / 1707 loss=3.681, nll_loss=2.142, ppl=4.41, wps=367531, ups=0.75, wpb=491042, bsz=16437.6, num_updates=20200, lr=0.000444994, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=27496 epoch 012: 1487 / 1707 loss=3.681, nll_loss=2.142, ppl=4.41, wps=367531, ups=0.75, wpb=491042, bsz=16437.6, num_updates=20200, lr=0.000444994, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=27496 epoch 012: 1487 / 1707 loss=3.681, nll_loss=2.142, ppl=4.41, wps=367531, ups=0.75, wpb=491042, bsz=16437.6, num_updates=20200, lr=0.000444994, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=27496 epoch 012: 1487 / 1707 loss=3.681, nll_loss=2.142, ppl=4.41, wps=367531, ups=0.75, wpb=491042, bsz=16437.6, num_updates=20200, lr=0.000444994, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=27496 epoch 012: 1487 / 1707 loss=3.681, nll_loss=2.142, ppl=4.41, wps=367531, ups=0.75, wpb=491042, bsz=16437.6, num_updates=20200, lr=0.000444994, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=27496 epoch 012: 1487 / 1707 loss=3.681, nll_loss=2.142, ppl=4.41, wps=367531, ups=0.75, wpb=491042, bsz=16437.6, num_updates=20200, lr=0.000444994, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=27496 epoch 012: 1487 / 1707 loss=3.681, nll_loss=2.142, ppl=4.41, wps=367531, ups=0.75, wpb=491042, bsz=16437.6, num_updates=20200, lr=0.000444994, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=27496 epoch 012: 1487 / 1707 loss=3.681, nll_loss=2.142, ppl=4.41, wps=367531, ups=0.75, wpb=491042, bsz=16437.6, num_updates=20200, lr=0.000444994, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=27496 epoch 012: 1487 / 1707 loss=3.681, nll_loss=2.142, ppl=4.41, wps=367531, ups=0.75, wpb=491042, bsz=16437.6, num_updates=20200, lr=0.000444994, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=27496 epoch 012: 1487 / 1707 loss=3.681, nll_loss=2.142, ppl=4.41, wps=367531, ups=0.75, wpb=491042, bsz=16437.6, num_updates=20200, lr=0.000444994, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=27496 epoch 012: 1487 / 1707 loss=3.681, nll_loss=2.142, ppl=4.41, wps=367531, ups=0.75, wpb=491042, bsz=16437.6, num_updates=20200, lr=0.000444994, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=27496 epoch 012: 1587 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=366346, ups=0.75, wpb=490049, bsz=16405.8, num_updates=20300, lr=0.000443897, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=27630 epoch 012: 1587 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=366346, ups=0.75, wpb=490049, bsz=16405.8, num_updates=20300, lr=0.000443897, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=27630 epoch 012: 1587 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=366346, ups=0.75, wpb=490049, bsz=16405.8, num_updates=20300, lr=0.000443897, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=27630 epoch 012: 1587 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=366346, ups=0.75, wpb=490049, bsz=16405.8, num_updates=20300, lr=0.000443897, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=27630 epoch 012: 1587 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=366346, ups=0.75, wpb=490049, bsz=16405.8, num_updates=20300, lr=0.000443897, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=27630 epoch 012: 1587 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=366346, ups=0.75, wpb=490049, bsz=16405.8, num_updates=20300, lr=0.000443897, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=27630 epoch 012: 1587 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=366346, ups=0.75, wpb=490049, bsz=16405.8, num_updates=20300, lr=0.000443897, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=27630 epoch 012: 1587 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=366346, ups=0.75, wpb=490049, bsz=16405.8, num_updates=20300, lr=0.000443897, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=27630 epoch 012: 1587 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=366346, ups=0.75, wpb=490049, bsz=16405.8, num_updates=20300, lr=0.000443897, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=27630 epoch 012: 1587 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=366346, ups=0.75, wpb=490049, bsz=16405.8, num_updates=20300, lr=0.000443897, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=27630 epoch 012: 1587 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=366346, ups=0.75, wpb=490049, bsz=16405.8, num_updates=20300, lr=0.000443897, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=27630 epoch 012: 1587 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=366346, ups=0.75, wpb=490049, bsz=16405.8, num_updates=20300, lr=0.000443897, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=27630 epoch 012: 1687 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=367853, ups=0.75, wpb=490753, bsz=16416.5, num_updates=20400, lr=0.000442807, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=27763 epoch 012: 1687 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=367853, ups=0.75, wpb=490753, bsz=16416.5, num_updates=20400, lr=0.000442807, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=27763 epoch 012: 1687 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=367853, ups=0.75, wpb=490753, bsz=16416.5, num_updates=20400, lr=0.000442807, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=27763 epoch 012: 1687 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=367853, ups=0.75, wpb=490753, bsz=16416.5, num_updates=20400, lr=0.000442807, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=27763 epoch 012: 1687 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=367853, ups=0.75, wpb=490753, bsz=16416.5, num_updates=20400, lr=0.000442807, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=27763 epoch 012: 1687 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=367853, ups=0.75, wpb=490753, bsz=16416.5, num_updates=20400, lr=0.000442807, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=27763 epoch 012: 1687 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=367853, ups=0.75, wpb=490753, bsz=16416.5, num_updates=20400, lr=0.000442807, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=27763 epoch 012: 1687 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=367853, ups=0.75, wpb=490753, bsz=16416.5, num_updates=20400, lr=0.000442807, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=27763 epoch 012: 1687 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=367853, ups=0.75, wpb=490753, bsz=16416.5, num_updates=20400, lr=0.000442807, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=27763 epoch 012: 1687 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=367853, ups=0.75, wpb=490753, bsz=16416.5, num_updates=20400, lr=0.000442807, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=27763 epoch 012: 1687 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=367853, ups=0.75, wpb=490753, bsz=16416.5, num_updates=20400, lr=0.000442807, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=27763 epoch 012: 1687 / 1707 loss=3.683, nll_loss=2.144, ppl=4.42, wps=367853, ups=0.75, wpb=490753, bsz=16416.5, num_updates=20400, lr=0.000442807, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=27763 end of epoch 12 (average epoch stats below) epoch 012 | loss 3.676 | nll_loss 2.136 | ppl 4.39 | wps 361829 | ups 0.74 | wpb 489899 | bsz 16333.9 | num_updates 20420 | lr 0.000442591 | gnorm 0.262 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 27789 epoch 012 | loss 3.676 | nll_loss 2.136 | ppl 4.39 | wps 361829 | ups 0.74 | wpb 489899 | bsz 16333.9 | num_updates 20420 | lr 0.000442591 | gnorm 0.262 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 27789 epoch 012 | loss 3.676 | nll_loss 2.136 | ppl 4.39 | wps 361829 | ups 0.74 | wpb 489899 | bsz 16333.9 | num_updates 20420 | lr 0.000442591 | gnorm 0.262 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 27789 epoch 012 | loss 3.676 | nll_loss 2.136 | ppl 4.39 | wps 361829 | ups 0.74 | wpb 489899 | bsz 16333.9 | num_updates 20420 | lr 0.000442591 | gnorm 0.262 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 27789 epoch 012 | loss 3.676 | nll_loss 2.136 | ppl 4.39 | wps 361829 | ups 0.74 | wpb 489899 | bsz 16333.9 | num_updates 20420 | lr 0.000442591 | gnorm 0.262 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 27789 epoch 012 | loss 3.676 | nll_loss 2.136 | ppl 4.39 | wps 361829 | ups 0.74 | wpb 489899 | bsz 16333.9 | num_updates 20420 | lr 0.000442591 | gnorm 0.262 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 27789 epoch 012 | loss 3.676 | nll_loss 2.136 | ppl 4.39 | wps 361829 | ups 0.74 | wpb 489899 | bsz 16333.9 | num_updates 20420 | lr 0.000442591 | gnorm 0.262 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 27789 epoch 012 | loss 3.676 | nll_loss 2.136 | ppl 4.39 | wps 361829 | ups 0.74 | wpb 489899 | bsz 16333.9 | num_updates 20420 | lr 0.000442591 | gnorm 0.262 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 27789 epoch 012 | loss 3.676 | nll_loss 2.136 | ppl 4.39 | wps 361829 | ups 0.74 | wpb 489899 | bsz 16333.9 | num_updates 20420 | lr 0.000442591 | gnorm 0.262 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 27789 epoch 012 | loss 3.676 | nll_loss 2.136 | ppl 4.39 | wps 361829 | ups 0.74 | wpb 489899 | bsz 16333.9 | num_updates 20420 | lr 0.000442591 | gnorm 0.262 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 27789 epoch 012 | loss 3.676 | nll_loss 2.136 | ppl 4.39 | wps 361829 | ups 0.74 | wpb 489899 | bsz 16333.9 | num_updates 20420 | lr 0.000442591 | gnorm 0.262 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 27789 epoch 012 | loss 3.676 | nll_loss 2.136 | ppl 4.39 | wps 361829 | ups 0.74 | wpb 489899 | bsz 16333.9 | num_updates 20420 | lr 0.000442591 | gnorm 0.262 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 61.4 | wall 27789 Start iterating over samples epoch 013: 80 / 1707 loss=3.656, nll_loss=2.112, ppl=4.32, wps=366335, ups=0.75, wpb=487208, bsz=16412.9, num_updates=20500, lr=0.000441726, gnorm=0.277, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=27896 epoch 013: 80 / 1707 loss=3.656, nll_loss=2.112, ppl=4.32, wps=366335, ups=0.75, wpb=487208, bsz=16412.9, num_updates=20500, lr=0.000441726, gnorm=0.277, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=27896 epoch 013: 80 / 1707 loss=3.656, nll_loss=2.112, ppl=4.32, wps=366335, ups=0.75, wpb=487208, bsz=16412.9, num_updates=20500, lr=0.000441726, gnorm=0.277, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=27896 epoch 013: 80 / 1707 loss=3.656, nll_loss=2.112, ppl=4.32, wps=366335, ups=0.75, wpb=487208, bsz=16412.9, num_updates=20500, lr=0.000441726, gnorm=0.277, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=27896 epoch 013: 80 / 1707 loss=3.656, nll_loss=2.112, ppl=4.32, wps=366335, ups=0.75, wpb=487208, bsz=16412.9, num_updates=20500, lr=0.000441726, gnorm=0.277, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=27896 epoch 013: 80 / 1707 loss=3.656, nll_loss=2.112, ppl=4.32, wps=366335, ups=0.75, wpb=487208, bsz=16412.9, num_updates=20500, lr=0.000441726, gnorm=0.277, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=27896 epoch 013: 80 / 1707 loss=3.656, nll_loss=2.112, ppl=4.32, wps=366335, ups=0.75, wpb=487208, bsz=16412.9, num_updates=20500, lr=0.000441726, gnorm=0.277, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=27896 epoch 013: 80 / 1707 loss=3.656, nll_loss=2.112, ppl=4.32, wps=366335, ups=0.75, wpb=487208, bsz=16412.9, num_updates=20500, lr=0.000441726, gnorm=0.277, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=27896 epoch 013: 80 / 1707 loss=3.656, nll_loss=2.112, ppl=4.32, wps=366335, ups=0.75, wpb=487208, bsz=16412.9, num_updates=20500, lr=0.000441726, gnorm=0.277, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=27896 epoch 013: 80 / 1707 loss=3.656, nll_loss=2.112, ppl=4.32, wps=366335, ups=0.75, wpb=487208, bsz=16412.9, num_updates=20500, lr=0.000441726, gnorm=0.277, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=27896 epoch 013: 80 / 1707 loss=3.656, nll_loss=2.112, ppl=4.32, wps=366335, ups=0.75, wpb=487208, bsz=16412.9, num_updates=20500, lr=0.000441726, gnorm=0.277, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=27896 epoch 013: 80 / 1707 loss=3.656, nll_loss=2.112, ppl=4.32, wps=366335, ups=0.75, wpb=487208, bsz=16412.9, num_updates=20500, lr=0.000441726, gnorm=0.277, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=27896 epoch 013: 80 / 1707 loss=3.656, nll_loss=2.112, ppl=4.32, wps=366335, ups=0.75, wpb=487208, bsz=16412.9, num_updates=20500, lr=0.000441726, gnorm=0.277, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=27896 epoch 013: 181 / 1707 loss=3.655, nll_loss=2.112, ppl=4.32, wps=362631, ups=0.74, wpb=490535, bsz=16397.8, num_updates=20600, lr=0.000440653, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=28031 epoch 013: 181 / 1707 loss=3.655, nll_loss=2.112, ppl=4.32, wps=362631, ups=0.74, wpb=490535, bsz=16397.8, num_updates=20600, lr=0.000440653, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=28031 epoch 013: 181 / 1707 loss=3.655, nll_loss=2.112, ppl=4.32, wps=362631, ups=0.74, wpb=490535, bsz=16397.8, num_updates=20600, lr=0.000440653, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=28031 epoch 013: 181 / 1707 loss=3.655, nll_loss=2.112, ppl=4.32, wps=362631, ups=0.74, wpb=490535, bsz=16397.8, num_updates=20600, lr=0.000440653, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=28031 epoch 013: 181 / 1707 loss=3.655, nll_loss=2.112, ppl=4.32, wps=362631, ups=0.74, wpb=490535, bsz=16397.8, num_updates=20600, lr=0.000440653, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=28031 epoch 013: 181 / 1707 loss=3.655, nll_loss=2.112, ppl=4.32, wps=362631, ups=0.74, wpb=490535, bsz=16397.8, num_updates=20600, lr=0.000440653, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=28031 epoch 013: 181 / 1707 loss=3.655, nll_loss=2.112, ppl=4.32, wps=362631, ups=0.74, wpb=490535, bsz=16397.8, num_updates=20600, lr=0.000440653, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=28031 epoch 013: 181 / 1707 loss=3.655, nll_loss=2.112, ppl=4.32, wps=362631, ups=0.74, wpb=490535, bsz=16397.8, num_updates=20600, lr=0.000440653, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=28031 epoch 013: 181 / 1707 loss=3.655, nll_loss=2.112, ppl=4.32, wps=362631, ups=0.74, wpb=490535, bsz=16397.8, num_updates=20600, lr=0.000440653, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=28031 epoch 013: 181 / 1707 loss=3.655, nll_loss=2.112, ppl=4.32, wps=362631, ups=0.74, wpb=490535, bsz=16397.8, num_updates=20600, lr=0.000440653, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=28031 epoch 013: 181 / 1707 loss=3.655, nll_loss=2.112, ppl=4.32, wps=362631, ups=0.74, wpb=490535, bsz=16397.8, num_updates=20600, lr=0.000440653, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=28031 epoch 013: 181 / 1707 loss=3.655, nll_loss=2.112, ppl=4.32, wps=362631, ups=0.74, wpb=490535, bsz=16397.8, num_updates=20600, lr=0.000440653, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=28031 epoch 013: 181 / 1707 loss=3.655, nll_loss=2.112, ppl=4.32, wps=362631, ups=0.74, wpb=490535, bsz=16397.8, num_updates=20600, lr=0.000440653, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=28031 epoch 013: 281 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=366851, ups=0.75, wpb=489485, bsz=16291.4, num_updates=20700, lr=0.000439587, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=28165 epoch 013: 281 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=366851, ups=0.75, wpb=489485, bsz=16291.4, num_updates=20700, lr=0.000439587, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=28165 epoch 013: 281 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=366851, ups=0.75, wpb=489485, bsz=16291.4, num_updates=20700, lr=0.000439587, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=28165 epoch 013: 281 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=366851, ups=0.75, wpb=489485, bsz=16291.4, num_updates=20700, lr=0.000439587, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=28165 epoch 013: 281 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=366851, ups=0.75, wpb=489485, bsz=16291.4, num_updates=20700, lr=0.000439587, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=28165 epoch 013: 281 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=366851, ups=0.75, wpb=489485, bsz=16291.4, num_updates=20700, lr=0.000439587, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=28165 epoch 013: 281 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=366851, ups=0.75, wpb=489485, bsz=16291.4, num_updates=20700, lr=0.000439587, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=28165 epoch 013: 281 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=366851, ups=0.75, wpb=489485, bsz=16291.4, num_updates=20700, lr=0.000439587, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=28165 epoch 013: 281 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=366851, ups=0.75, wpb=489485, bsz=16291.4, num_updates=20700, lr=0.000439587, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=28165 epoch 013: 281 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=366851, ups=0.75, wpb=489485, bsz=16291.4, num_updates=20700, lr=0.000439587, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=28165 epoch 013: 281 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=366851, ups=0.75, wpb=489485, bsz=16291.4, num_updates=20700, lr=0.000439587, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=28165 epoch 013: 281 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=366851, ups=0.75, wpb=489485, bsz=16291.4, num_updates=20700, lr=0.000439587, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=28165 epoch 013: 281 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=366851, ups=0.75, wpb=489485, bsz=16291.4, num_updates=20700, lr=0.000439587, gnorm=0.264, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=28165 epoch 013: 381 / 1707 loss=3.651, nll_loss=2.108, ppl=4.31, wps=367186, ups=0.75, wpb=490612, bsz=16404.2, num_updates=20800, lr=0.000438529, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=28298 epoch 013: 381 / 1707 loss=3.651, nll_loss=2.108, ppl=4.31, wps=367186, ups=0.75, wpb=490612, bsz=16404.2, num_updates=20800, lr=0.000438529, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=28298 epoch 013: 381 / 1707 loss=3.651, nll_loss=2.108, ppl=4.31, wps=367186, ups=0.75, wpb=490612, bsz=16404.2, num_updates=20800, lr=0.000438529, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=28298 epoch 013: 381 / 1707 loss=3.651, nll_loss=2.108, ppl=4.31, wps=367186, ups=0.75, wpb=490612, bsz=16404.2, num_updates=20800, lr=0.000438529, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=28298 epoch 013: 381 / 1707 loss=3.651, nll_loss=2.108, ppl=4.31, wps=367186, ups=0.75, wpb=490612, bsz=16404.2, num_updates=20800, lr=0.000438529, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=28298 epoch 013: 381 / 1707 loss=3.651, nll_loss=2.108, ppl=4.31, wps=367186, ups=0.75, wpb=490612, bsz=16404.2, num_updates=20800, lr=0.000438529, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=28298 epoch 013: 381 / 1707 loss=3.651, nll_loss=2.108, ppl=4.31, wps=367186, ups=0.75, wpb=490612, bsz=16404.2, num_updates=20800, lr=0.000438529, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=28298 epoch 013: 381 / 1707 loss=3.651, nll_loss=2.108, ppl=4.31, wps=367186, ups=0.75, wpb=490612, bsz=16404.2, num_updates=20800, lr=0.000438529, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=28298 epoch 013: 381 / 1707 loss=3.651, nll_loss=2.108, ppl=4.31, wps=367186, ups=0.75, wpb=490612, bsz=16404.2, num_updates=20800, lr=0.000438529, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=28298 epoch 013: 381 / 1707 loss=3.651, nll_loss=2.108, ppl=4.31, wps=367186, ups=0.75, wpb=490612, bsz=16404.2, num_updates=20800, lr=0.000438529, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=28298 epoch 013: 381 / 1707 loss=3.651, nll_loss=2.108, ppl=4.31, wps=367186, ups=0.75, wpb=490612, bsz=16404.2, num_updates=20800, lr=0.000438529, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=28298 epoch 013: 381 / 1707 loss=3.651, nll_loss=2.108, ppl=4.31, wps=367186, ups=0.75, wpb=490612, bsz=16404.2, num_updates=20800, lr=0.000438529, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=28298 epoch 013: 381 / 1707 loss=3.651, nll_loss=2.108, ppl=4.31, wps=367186, ups=0.75, wpb=490612, bsz=16404.2, num_updates=20800, lr=0.000438529, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=28298 epoch 013: 481 / 1707 loss=3.66, nll_loss=2.118, ppl=4.34, wps=365865, ups=0.75, wpb=488969, bsz=16299.1, num_updates=20900, lr=0.000437479, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=28432 epoch 013: 481 / 1707 loss=3.66, nll_loss=2.118, ppl=4.34, wps=365865, ups=0.75, wpb=488969, bsz=16299.1, num_updates=20900, lr=0.000437479, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=28432 epoch 013: 481 / 1707 loss=3.66, nll_loss=2.118, ppl=4.34, wps=365865, ups=0.75, wpb=488969, bsz=16299.1, num_updates=20900, lr=0.000437479, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=28432 epoch 013: 481 / 1707 loss=3.66, nll_loss=2.118, ppl=4.34, wps=365865, ups=0.75, wpb=488969, bsz=16299.1, num_updates=20900, lr=0.000437479, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=28432 epoch 013: 481 / 1707 loss=3.66, nll_loss=2.118, ppl=4.34, wps=365865, ups=0.75, wpb=488969, bsz=16299.1, num_updates=20900, lr=0.000437479, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=28432 epoch 013: 481 / 1707 loss=3.66, nll_loss=2.118, ppl=4.34, wps=365865, ups=0.75, wpb=488969, bsz=16299.1, num_updates=20900, lr=0.000437479, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=28432 epoch 013: 481 / 1707 loss=3.66, nll_loss=2.118, ppl=4.34, wps=365865, ups=0.75, wpb=488969, bsz=16299.1, num_updates=20900, lr=0.000437479, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=28432 epoch 013: 481 / 1707 loss=3.66, nll_loss=2.118, ppl=4.34, wps=365865, ups=0.75, wpb=488969, bsz=16299.1, num_updates=20900, lr=0.000437479, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=28432 epoch 013: 481 / 1707 loss=3.66, nll_loss=2.118, ppl=4.34, wps=365865, ups=0.75, wpb=488969, bsz=16299.1, num_updates=20900, lr=0.000437479, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=28432 epoch 013: 481 / 1707 loss=3.66, nll_loss=2.118, ppl=4.34, wps=365865, ups=0.75, wpb=488969, bsz=16299.1, num_updates=20900, lr=0.000437479, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=28432 epoch 013: 481 / 1707 loss=3.66, nll_loss=2.118, ppl=4.34, wps=365865, ups=0.75, wpb=488969, bsz=16299.1, num_updates=20900, lr=0.000437479, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=28432 epoch 013: 481 / 1707 loss=3.66, nll_loss=2.118, ppl=4.34, wps=365865, ups=0.75, wpb=488969, bsz=16299.1, num_updates=20900, lr=0.000437479, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=28432 epoch 013: 481 / 1707 loss=3.66, nll_loss=2.118, ppl=4.34, wps=365865, ups=0.75, wpb=488969, bsz=16299.1, num_updates=20900, lr=0.000437479, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=28432 epoch 013: 582 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=360974, ups=0.74, wpb=489254, bsz=16385, num_updates=21000, lr=0.000436436, gnorm=0.257, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=28568 epoch 013: 582 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=360974, ups=0.74, wpb=489254, bsz=16385, num_updates=21000, lr=0.000436436, gnorm=0.257, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=28568 epoch 013: 582 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=360974, ups=0.74, wpb=489254, bsz=16385, num_updates=21000, lr=0.000436436, gnorm=0.257, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=28568 epoch 013: 582 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=360974, ups=0.74, wpb=489254, bsz=16385, num_updates=21000, lr=0.000436436, gnorm=0.257, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=28568 epoch 013: 582 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=360974, ups=0.74, wpb=489254, bsz=16385, num_updates=21000, lr=0.000436436, gnorm=0.257, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=28568 epoch 013: 582 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=360974, ups=0.74, wpb=489254, bsz=16385, num_updates=21000, lr=0.000436436, gnorm=0.257, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=28568 epoch 013: 582 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=360974, ups=0.74, wpb=489254, bsz=16385, num_updates=21000, lr=0.000436436, gnorm=0.257, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=28568 epoch 013: 582 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=360974, ups=0.74, wpb=489254, bsz=16385, num_updates=21000, lr=0.000436436, gnorm=0.257, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=28568 epoch 013: 582 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=360974, ups=0.74, wpb=489254, bsz=16385, num_updates=21000, lr=0.000436436, gnorm=0.257, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=28568 epoch 013: 582 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=360974, ups=0.74, wpb=489254, bsz=16385, num_updates=21000, lr=0.000436436, gnorm=0.257, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=28568 epoch 013: 582 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=360974, ups=0.74, wpb=489254, bsz=16385, num_updates=21000, lr=0.000436436, gnorm=0.257, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=28568 epoch 013: 582 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=360974, ups=0.74, wpb=489254, bsz=16385, num_updates=21000, lr=0.000436436, gnorm=0.257, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=28568 epoch 013: 582 / 1707 loss=3.656, nll_loss=2.113, ppl=4.33, wps=360974, ups=0.74, wpb=489254, bsz=16385, num_updates=21000, lr=0.000436436, gnorm=0.257, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=28568 begin validation on "valid" subset epoch 013 | valid on 'valid' subset | loss 3.803 | nll_loss 2.255 | ppl 4.77 | wps 218969 | wpb 22263 | bsz 1004 | num_updates 21000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.803 | nll_loss 2.255 | ppl 4.77 | wps 218969 | wpb 22263 | bsz 1004 | num_updates 21000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.803 | nll_loss 2.255 | ppl 4.77 | wps 218969 | wpb 22263 | bsz 1004 | num_updates 21000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.803 | nll_loss 2.255 | ppl 4.77 | wps 218969 | wpb 22263 | bsz 1004 | num_updates 21000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.803 | nll_loss 2.255 | ppl 4.77 | wps 218969 | wpb 22263 | bsz 1004 | num_updates 21000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.803 | nll_loss 2.255 | ppl 4.77 | wps 218969 | wpb 22263 | bsz 1004 | num_updates 21000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.803 | nll_loss 2.255 | ppl 4.77 | wps 218969 | wpb 22263 | bsz 1004 | num_updates 21000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.803 | nll_loss 2.255 | ppl 4.77 | wps 218969 | wpb 22263 | bsz 1004 | num_updates 21000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.803 | nll_loss 2.255 | ppl 4.77 | wps 218969 | wpb 22263 | bsz 1004 | num_updates 21000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.803 | nll_loss 2.255 | ppl 4.77 | wps 218969 | wpb 22263 | bsz 1004 | num_updates 21000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.803 | nll_loss 2.255 | ppl 4.77 | wps 218969 | wpb 22263 | bsz 1004 | num_updates 21000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.803 | nll_loss 2.255 | ppl 4.77 | wps 218969 | wpb 22263 | bsz 1004 | num_updates 21000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.803 | nll_loss 2.255 | ppl 4.77 | wps 218969 | wpb 22263 | bsz 1004 | num_updates 21000 | best_loss 3.803 epoch 013: 683 / 1707 loss=3.662, nll_loss=2.12, ppl=4.35, wps=322574, ups=0.66, wpb=489950, bsz=16257.3, num_updates=21100, lr=0.0004354, gnorm=0.268, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=28719 epoch 013: 683 / 1707 loss=3.662, nll_loss=2.12, ppl=4.35, wps=322574, ups=0.66, wpb=489950, bsz=16257.3, num_updates=21100, lr=0.0004354, gnorm=0.268, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=28719 epoch 013: 683 / 1707 loss=3.662, nll_loss=2.12, ppl=4.35, wps=322574, ups=0.66, wpb=489950, bsz=16257.3, num_updates=21100, lr=0.0004354, gnorm=0.268, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=28719 epoch 013: 683 / 1707 loss=3.662, nll_loss=2.12, ppl=4.35, wps=322574, ups=0.66, wpb=489950, bsz=16257.3, num_updates=21100, lr=0.0004354, gnorm=0.268, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=28719 epoch 013: 683 / 1707 loss=3.662, nll_loss=2.12, ppl=4.35, wps=322574, ups=0.66, wpb=489950, bsz=16257.3, num_updates=21100, lr=0.0004354, gnorm=0.268, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=28719 epoch 013: 683 / 1707 loss=3.662, nll_loss=2.12, ppl=4.35, wps=322574, ups=0.66, wpb=489950, bsz=16257.3, num_updates=21100, lr=0.0004354, gnorm=0.268, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=28719 epoch 013: 683 / 1707 loss=3.662, nll_loss=2.12, ppl=4.35, wps=322574, ups=0.66, wpb=489950, bsz=16257.3, num_updates=21100, lr=0.0004354, gnorm=0.268, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=28719 epoch 013: 683 / 1707 loss=3.662, nll_loss=2.12, ppl=4.35, wps=322574, ups=0.66, wpb=489950, bsz=16257.3, num_updates=21100, lr=0.0004354, gnorm=0.268, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=28719 epoch 013: 683 / 1707 loss=3.662, nll_loss=2.12, ppl=4.35, wps=322574, ups=0.66, wpb=489950, bsz=16257.3, num_updates=21100, lr=0.0004354, gnorm=0.268, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=28719 epoch 013: 683 / 1707 loss=3.662, nll_loss=2.12, ppl=4.35, wps=322574, ups=0.66, wpb=489950, bsz=16257.3, num_updates=21100, lr=0.0004354, gnorm=0.268, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=28719 epoch 013: 683 / 1707 loss=3.662, nll_loss=2.12, ppl=4.35, wps=322574, ups=0.66, wpb=489950, bsz=16257.3, num_updates=21100, lr=0.0004354, gnorm=0.268, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=28719 epoch 013: 683 / 1707 loss=3.662, nll_loss=2.12, ppl=4.35, wps=322574, ups=0.66, wpb=489950, bsz=16257.3, num_updates=21100, lr=0.0004354, gnorm=0.268, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=28719 epoch 013: 683 / 1707 loss=3.662, nll_loss=2.12, ppl=4.35, wps=322574, ups=0.66, wpb=489950, bsz=16257.3, num_updates=21100, lr=0.0004354, gnorm=0.268, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=28719 epoch 013: 783 / 1707 loss=3.663, nll_loss=2.121, ppl=4.35, wps=367635, ups=0.75, wpb=491036, bsz=16345.2, num_updates=21200, lr=0.000434372, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=28853 epoch 013: 783 / 1707 loss=3.663, nll_loss=2.121, ppl=4.35, wps=367635, ups=0.75, wpb=491036, bsz=16345.2, num_updates=21200, lr=0.000434372, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=28853 epoch 013: 783 / 1707 loss=3.663, nll_loss=2.121, ppl=4.35, wps=367635, ups=0.75, wpb=491036, bsz=16345.2, num_updates=21200, lr=0.000434372, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=28853 epoch 013: 783 / 1707 loss=3.663, nll_loss=2.121, ppl=4.35, wps=367635, ups=0.75, wpb=491036, bsz=16345.2, num_updates=21200, lr=0.000434372, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=28853 epoch 013: 783 / 1707 loss=3.663, nll_loss=2.121, ppl=4.35, wps=367635, ups=0.75, wpb=491036, bsz=16345.2, num_updates=21200, lr=0.000434372, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=28853 epoch 013: 783 / 1707 loss=3.663, nll_loss=2.121, ppl=4.35, wps=367635, ups=0.75, wpb=491036, bsz=16345.2, num_updates=21200, lr=0.000434372, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=28853 epoch 013: 783 / 1707 loss=3.663, nll_loss=2.121, ppl=4.35, wps=367635, ups=0.75, wpb=491036, bsz=16345.2, num_updates=21200, lr=0.000434372, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=28853 epoch 013: 783 / 1707 loss=3.663, nll_loss=2.121, ppl=4.35, wps=367635, ups=0.75, wpb=491036, bsz=16345.2, num_updates=21200, lr=0.000434372, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=28853 epoch 013: 783 / 1707 loss=3.663, nll_loss=2.121, ppl=4.35, wps=367635, ups=0.75, wpb=491036, bsz=16345.2, num_updates=21200, lr=0.000434372, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=28853 epoch 013: 783 / 1707 loss=3.663, nll_loss=2.121, ppl=4.35, wps=367635, ups=0.75, wpb=491036, bsz=16345.2, num_updates=21200, lr=0.000434372, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=28853 epoch 013: 783 / 1707 loss=3.663, nll_loss=2.121, ppl=4.35, wps=367635, ups=0.75, wpb=491036, bsz=16345.2, num_updates=21200, lr=0.000434372, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=28853 epoch 013: 783 / 1707 loss=3.663, nll_loss=2.121, ppl=4.35, wps=367635, ups=0.75, wpb=491036, bsz=16345.2, num_updates=21200, lr=0.000434372, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=28853 epoch 013: 783 / 1707 loss=3.663, nll_loss=2.121, ppl=4.35, wps=367635, ups=0.75, wpb=491036, bsz=16345.2, num_updates=21200, lr=0.000434372, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=28853 epoch 013: 883 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367087, ups=0.75, wpb=490982, bsz=16242.1, num_updates=21300, lr=0.000433351, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=28987 epoch 013: 883 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367087, ups=0.75, wpb=490982, bsz=16242.1, num_updates=21300, lr=0.000433351, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=28987 epoch 013: 883 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367087, ups=0.75, wpb=490982, bsz=16242.1, num_updates=21300, lr=0.000433351, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=28987 epoch 013: 883 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367087, ups=0.75, wpb=490982, bsz=16242.1, num_updates=21300, lr=0.000433351, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=28987 epoch 013: 883 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367087, ups=0.75, wpb=490982, bsz=16242.1, num_updates=21300, lr=0.000433351, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=28987 epoch 013: 883 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367087, ups=0.75, wpb=490982, bsz=16242.1, num_updates=21300, lr=0.000433351, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=28987 epoch 013: 883 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367087, ups=0.75, wpb=490982, bsz=16242.1, num_updates=21300, lr=0.000433351, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=28987 epoch 013: 883 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367087, ups=0.75, wpb=490982, bsz=16242.1, num_updates=21300, lr=0.000433351, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=28987 epoch 013: 883 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367087, ups=0.75, wpb=490982, bsz=16242.1, num_updates=21300, lr=0.000433351, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=28987 epoch 013: 883 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367087, ups=0.75, wpb=490982, bsz=16242.1, num_updates=21300, lr=0.000433351, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=28987 epoch 013: 883 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367087, ups=0.75, wpb=490982, bsz=16242.1, num_updates=21300, lr=0.000433351, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=28987 epoch 013: 883 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367087, ups=0.75, wpb=490982, bsz=16242.1, num_updates=21300, lr=0.000433351, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=28987 epoch 013: 883 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367087, ups=0.75, wpb=490982, bsz=16242.1, num_updates=21300, lr=0.000433351, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=28987 epoch 013: 983 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365512, ups=0.75, wpb=489883, bsz=16442.1, num_updates=21400, lr=0.000432338, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=61, wall=29121 epoch 013: 983 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365512, ups=0.75, wpb=489883, bsz=16442.1, num_updates=21400, lr=0.000432338, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=61, wall=29121 epoch 013: 983 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365512, ups=0.75, wpb=489883, bsz=16442.1, num_updates=21400, lr=0.000432338, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=61, wall=29121 epoch 013: 983 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365512, ups=0.75, wpb=489883, bsz=16442.1, num_updates=21400, lr=0.000432338, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=61, wall=29121 epoch 013: 983 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365512, ups=0.75, wpb=489883, bsz=16442.1, num_updates=21400, lr=0.000432338, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=61, wall=29121 epoch 013: 983 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365512, ups=0.75, wpb=489883, bsz=16442.1, num_updates=21400, lr=0.000432338, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=61, wall=29121 epoch 013: 983 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365512, ups=0.75, wpb=489883, bsz=16442.1, num_updates=21400, lr=0.000432338, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=61, wall=29121 epoch 013: 983 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365512, ups=0.75, wpb=489883, bsz=16442.1, num_updates=21400, lr=0.000432338, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=61, wall=29121 epoch 013: 983 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365512, ups=0.75, wpb=489883, bsz=16442.1, num_updates=21400, lr=0.000432338, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=61, wall=29121 epoch 013: 983 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365512, ups=0.75, wpb=489883, bsz=16442.1, num_updates=21400, lr=0.000432338, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=61, wall=29121 epoch 013: 983 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365512, ups=0.75, wpb=489883, bsz=16442.1, num_updates=21400, lr=0.000432338, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=61, wall=29121 epoch 013: 983 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365512, ups=0.75, wpb=489883, bsz=16442.1, num_updates=21400, lr=0.000432338, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=61, wall=29121 epoch 013: 983 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365512, ups=0.75, wpb=489883, bsz=16442.1, num_updates=21400, lr=0.000432338, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=61, wall=29121 epoch 013: 1084 / 1707 loss=3.665, nll_loss=2.123, ppl=4.36, wps=362541, ups=0.74, wpb=488585, bsz=16190.4, num_updates=21500, lr=0.000431331, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=29256 epoch 013: 1084 / 1707 loss=3.665, nll_loss=2.123, ppl=4.36, wps=362541, ups=0.74, wpb=488585, bsz=16190.4, num_updates=21500, lr=0.000431331, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=29256 epoch 013: 1084 / 1707 loss=3.665, nll_loss=2.123, ppl=4.36, wps=362541, ups=0.74, wpb=488585, bsz=16190.4, num_updates=21500, lr=0.000431331, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=29256 epoch 013: 1084 / 1707 loss=3.665, nll_loss=2.123, ppl=4.36, wps=362541, ups=0.74, wpb=488585, bsz=16190.4, num_updates=21500, lr=0.000431331, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=29256 epoch 013: 1084 / 1707 loss=3.665, nll_loss=2.123, ppl=4.36, wps=362541, ups=0.74, wpb=488585, bsz=16190.4, num_updates=21500, lr=0.000431331, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=29256 epoch 013: 1084 / 1707 loss=3.665, nll_loss=2.123, ppl=4.36, wps=362541, ups=0.74, wpb=488585, bsz=16190.4, num_updates=21500, lr=0.000431331, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=29256 epoch 013: 1084 / 1707 loss=3.665, nll_loss=2.123, ppl=4.36, wps=362541, ups=0.74, wpb=488585, bsz=16190.4, num_updates=21500, lr=0.000431331, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=29256 epoch 013: 1084 / 1707 loss=3.665, nll_loss=2.123, ppl=4.36, wps=362541, ups=0.74, wpb=488585, bsz=16190.4, num_updates=21500, lr=0.000431331, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=29256 epoch 013: 1084 / 1707 loss=3.665, nll_loss=2.123, ppl=4.36, wps=362541, ups=0.74, wpb=488585, bsz=16190.4, num_updates=21500, lr=0.000431331, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=29256 epoch 013: 1084 / 1707 loss=3.665, nll_loss=2.123, ppl=4.36, wps=362541, ups=0.74, wpb=488585, bsz=16190.4, num_updates=21500, lr=0.000431331, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=29256 epoch 013: 1084 / 1707 loss=3.665, nll_loss=2.123, ppl=4.36, wps=362541, ups=0.74, wpb=488585, bsz=16190.4, num_updates=21500, lr=0.000431331, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=29256 epoch 013: 1084 / 1707 loss=3.665, nll_loss=2.123, ppl=4.36, wps=362541, ups=0.74, wpb=488585, bsz=16190.4, num_updates=21500, lr=0.000431331, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=29256 epoch 013: 1084 / 1707 loss=3.665, nll_loss=2.123, ppl=4.36, wps=362541, ups=0.74, wpb=488585, bsz=16190.4, num_updates=21500, lr=0.000431331, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=29256 epoch 013: 1184 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367840, ups=0.75, wpb=490574, bsz=16649.8, num_updates=21600, lr=0.000430331, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=29389 epoch 013: 1184 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367840, ups=0.75, wpb=490574, bsz=16649.8, num_updates=21600, lr=0.000430331, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=29389 epoch 013: 1184 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367840, ups=0.75, wpb=490574, bsz=16649.8, num_updates=21600, lr=0.000430331, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=29389 epoch 013: 1184 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367840, ups=0.75, wpb=490574, bsz=16649.8, num_updates=21600, lr=0.000430331, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=29389 epoch 013: 1184 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367840, ups=0.75, wpb=490574, bsz=16649.8, num_updates=21600, lr=0.000430331, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=29389 epoch 013: 1184 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367840, ups=0.75, wpb=490574, bsz=16649.8, num_updates=21600, lr=0.000430331, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=29389 epoch 013: 1184 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367840, ups=0.75, wpb=490574, bsz=16649.8, num_updates=21600, lr=0.000430331, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=29389 epoch 013: 1184 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367840, ups=0.75, wpb=490574, bsz=16649.8, num_updates=21600, lr=0.000430331, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=29389 epoch 013: 1184 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367840, ups=0.75, wpb=490574, bsz=16649.8, num_updates=21600, lr=0.000430331, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=29389 epoch 013: 1184 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367840, ups=0.75, wpb=490574, bsz=16649.8, num_updates=21600, lr=0.000430331, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=29389 epoch 013: 1184 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367840, ups=0.75, wpb=490574, bsz=16649.8, num_updates=21600, lr=0.000430331, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=29389 epoch 013: 1184 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367840, ups=0.75, wpb=490574, bsz=16649.8, num_updates=21600, lr=0.000430331, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=29389 epoch 013: 1184 / 1707 loss=3.658, nll_loss=2.116, ppl=4.34, wps=367840, ups=0.75, wpb=490574, bsz=16649.8, num_updates=21600, lr=0.000430331, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=29389 epoch 013: 1284 / 1707 loss=3.667, nll_loss=2.127, ppl=4.37, wps=366571, ups=0.75, wpb=489916, bsz=16267, num_updates=21700, lr=0.000429339, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=29523 epoch 013: 1284 / 1707 loss=3.667, nll_loss=2.127, ppl=4.37, wps=366571, ups=0.75, wpb=489916, bsz=16267, num_updates=21700, lr=0.000429339, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=29523 epoch 013: 1284 / 1707 loss=3.667, nll_loss=2.127, ppl=4.37, wps=366571, ups=0.75, wpb=489916, bsz=16267, num_updates=21700, lr=0.000429339, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=29523 epoch 013: 1284 / 1707 loss=3.667, nll_loss=2.127, ppl=4.37, wps=366571, ups=0.75, wpb=489916, bsz=16267, num_updates=21700, lr=0.000429339, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=29523 epoch 013: 1284 / 1707 loss=3.667, nll_loss=2.127, ppl=4.37, wps=366571, ups=0.75, wpb=489916, bsz=16267, num_updates=21700, lr=0.000429339, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=29523 epoch 013: 1284 / 1707 loss=3.667, nll_loss=2.127, ppl=4.37, wps=366571, ups=0.75, wpb=489916, bsz=16267, num_updates=21700, lr=0.000429339, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=29523 epoch 013: 1284 / 1707 loss=3.667, nll_loss=2.127, ppl=4.37, wps=366571, ups=0.75, wpb=489916, bsz=16267, num_updates=21700, lr=0.000429339, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=29523 epoch 013: 1284 / 1707 loss=3.667, nll_loss=2.127, ppl=4.37, wps=366571, ups=0.75, wpb=489916, bsz=16267, num_updates=21700, lr=0.000429339, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=29523 epoch 013: 1284 / 1707 loss=3.667, nll_loss=2.127, ppl=4.37, wps=366571, ups=0.75, wpb=489916, bsz=16267, num_updates=21700, lr=0.000429339, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=29523 epoch 013: 1284 / 1707 loss=3.667, nll_loss=2.127, ppl=4.37, wps=366571, ups=0.75, wpb=489916, bsz=16267, num_updates=21700, lr=0.000429339, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=29523 epoch 013: 1284 / 1707 loss=3.667, nll_loss=2.127, ppl=4.37, wps=366571, ups=0.75, wpb=489916, bsz=16267, num_updates=21700, lr=0.000429339, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=29523 epoch 013: 1284 / 1707 loss=3.667, nll_loss=2.127, ppl=4.37, wps=366571, ups=0.75, wpb=489916, bsz=16267, num_updates=21700, lr=0.000429339, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=29523 epoch 013: 1284 / 1707 loss=3.667, nll_loss=2.127, ppl=4.37, wps=366571, ups=0.75, wpb=489916, bsz=16267, num_updates=21700, lr=0.000429339, gnorm=0.269, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=29523 epoch 013: 1384 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365229, ups=0.75, wpb=489576, bsz=16207.1, num_updates=21800, lr=0.000428353, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=29657 epoch 013: 1384 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365229, ups=0.75, wpb=489576, bsz=16207.1, num_updates=21800, lr=0.000428353, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=29657 epoch 013: 1384 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365229, ups=0.75, wpb=489576, bsz=16207.1, num_updates=21800, lr=0.000428353, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=29657 epoch 013: 1384 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365229, ups=0.75, wpb=489576, bsz=16207.1, num_updates=21800, lr=0.000428353, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=29657 epoch 013: 1384 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365229, ups=0.75, wpb=489576, bsz=16207.1, num_updates=21800, lr=0.000428353, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=29657 epoch 013: 1384 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365229, ups=0.75, wpb=489576, bsz=16207.1, num_updates=21800, lr=0.000428353, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=29657 epoch 013: 1384 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365229, ups=0.75, wpb=489576, bsz=16207.1, num_updates=21800, lr=0.000428353, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=29657 epoch 013: 1384 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365229, ups=0.75, wpb=489576, bsz=16207.1, num_updates=21800, lr=0.000428353, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=29657 epoch 013: 1384 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365229, ups=0.75, wpb=489576, bsz=16207.1, num_updates=21800, lr=0.000428353, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=29657 epoch 013: 1384 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365229, ups=0.75, wpb=489576, bsz=16207.1, num_updates=21800, lr=0.000428353, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=29657 epoch 013: 1384 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365229, ups=0.75, wpb=489576, bsz=16207.1, num_updates=21800, lr=0.000428353, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=29657 epoch 013: 1384 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365229, ups=0.75, wpb=489576, bsz=16207.1, num_updates=21800, lr=0.000428353, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=29657 epoch 013: 1384 / 1707 loss=3.665, nll_loss=2.124, ppl=4.36, wps=365229, ups=0.75, wpb=489576, bsz=16207.1, num_updates=21800, lr=0.000428353, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=29657 epoch 013: 1484 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=366392, ups=0.75, wpb=490176, bsz=16107, num_updates=21900, lr=0.000427374, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=29790 epoch 013: 1484 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=366392, ups=0.75, wpb=490176, bsz=16107, num_updates=21900, lr=0.000427374, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=29790 epoch 013: 1484 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=366392, ups=0.75, wpb=490176, bsz=16107, num_updates=21900, lr=0.000427374, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=29790 epoch 013: 1484 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=366392, ups=0.75, wpb=490176, bsz=16107, num_updates=21900, lr=0.000427374, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=29790 epoch 013: 1484 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=366392, ups=0.75, wpb=490176, bsz=16107, num_updates=21900, lr=0.000427374, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=29790 epoch 013: 1484 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=366392, ups=0.75, wpb=490176, bsz=16107, num_updates=21900, lr=0.000427374, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=29790 epoch 013: 1484 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=366392, ups=0.75, wpb=490176, bsz=16107, num_updates=21900, lr=0.000427374, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=29790 epoch 013: 1484 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=366392, ups=0.75, wpb=490176, bsz=16107, num_updates=21900, lr=0.000427374, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=29790 epoch 013: 1484 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=366392, ups=0.75, wpb=490176, bsz=16107, num_updates=21900, lr=0.000427374, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=29790 epoch 013: 1484 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=366392, ups=0.75, wpb=490176, bsz=16107, num_updates=21900, lr=0.000427374, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=29790 epoch 013: 1484 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=366392, ups=0.75, wpb=490176, bsz=16107, num_updates=21900, lr=0.000427374, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=29790 epoch 013: 1484 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=366392, ups=0.75, wpb=490176, bsz=16107, num_updates=21900, lr=0.000427374, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=29790 epoch 013: 1484 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=366392, ups=0.75, wpb=490176, bsz=16107, num_updates=21900, lr=0.000427374, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=29790 epoch 013: 1584 / 1707 loss=3.662, nll_loss=2.121, ppl=4.35, wps=367118, ups=0.75, wpb=490462, bsz=16453.6, num_updates=22000, lr=0.000426401, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=29924 epoch 013: 1584 / 1707 loss=3.662, nll_loss=2.121, ppl=4.35, wps=367118, ups=0.75, wpb=490462, bsz=16453.6, num_updates=22000, lr=0.000426401, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=29924 epoch 013: 1584 / 1707 loss=3.662, nll_loss=2.121, ppl=4.35, wps=367118, ups=0.75, wpb=490462, bsz=16453.6, num_updates=22000, lr=0.000426401, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=29924 epoch 013: 1584 / 1707 loss=3.662, nll_loss=2.121, ppl=4.35, wps=367118, ups=0.75, wpb=490462, bsz=16453.6, num_updates=22000, lr=0.000426401, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=29924 epoch 013: 1584 / 1707 loss=3.662, nll_loss=2.121, ppl=4.35, wps=367118, ups=0.75, wpb=490462, bsz=16453.6, num_updates=22000, lr=0.000426401, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=29924 epoch 013: 1584 / 1707 loss=3.662, nll_loss=2.121, ppl=4.35, wps=367118, ups=0.75, wpb=490462, bsz=16453.6, num_updates=22000, lr=0.000426401, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=29924 epoch 013: 1584 / 1707 loss=3.662, nll_loss=2.121, ppl=4.35, wps=367118, ups=0.75, wpb=490462, bsz=16453.6, num_updates=22000, lr=0.000426401, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=29924 epoch 013: 1584 / 1707 loss=3.662, nll_loss=2.121, ppl=4.35, wps=367118, ups=0.75, wpb=490462, bsz=16453.6, num_updates=22000, lr=0.000426401, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=29924 epoch 013: 1584 / 1707 loss=3.662, nll_loss=2.121, ppl=4.35, wps=367118, ups=0.75, wpb=490462, bsz=16453.6, num_updates=22000, lr=0.000426401, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=29924 epoch 013: 1584 / 1707 loss=3.662, nll_loss=2.121, ppl=4.35, wps=367118, ups=0.75, wpb=490462, bsz=16453.6, num_updates=22000, lr=0.000426401, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=29924 epoch 013: 1584 / 1707 loss=3.662, nll_loss=2.121, ppl=4.35, wps=367118, ups=0.75, wpb=490462, bsz=16453.6, num_updates=22000, lr=0.000426401, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=29924 epoch 013: 1584 / 1707 loss=3.662, nll_loss=2.121, ppl=4.35, wps=367118, ups=0.75, wpb=490462, bsz=16453.6, num_updates=22000, lr=0.000426401, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=29924 epoch 013: 1584 / 1707 loss=3.662, nll_loss=2.121, ppl=4.35, wps=367118, ups=0.75, wpb=490462, bsz=16453.6, num_updates=22000, lr=0.000426401, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=29924 begin validation on "valid" subset epoch 013 | valid on 'valid' subset | loss 3.816 | nll_loss 2.271 | ppl 4.83 | wps 221685 | wpb 22263 | bsz 1004 | num_updates 22000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.816 | nll_loss 2.271 | ppl 4.83 | wps 221685 | wpb 22263 | bsz 1004 | num_updates 22000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.816 | nll_loss 2.271 | ppl 4.83 | wps 221685 | wpb 22263 | bsz 1004 | num_updates 22000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.816 | nll_loss 2.271 | ppl 4.83 | wps 221685 | wpb 22263 | bsz 1004 | num_updates 22000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.816 | nll_loss 2.271 | ppl 4.83 | wps 221685 | wpb 22263 | bsz 1004 | num_updates 22000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.816 | nll_loss 2.271 | ppl 4.83 | wps 221685 | wpb 22263 | bsz 1004 | num_updates 22000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.816 | nll_loss 2.271 | ppl 4.83 | wps 221685 | wpb 22263 | bsz 1004 | num_updates 22000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.816 | nll_loss 2.271 | ppl 4.83 | wps 221685 | wpb 22263 | bsz 1004 | num_updates 22000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.816 | nll_loss 2.271 | ppl 4.83 | wps 221685 | wpb 22263 | bsz 1004 | num_updates 22000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.816 | nll_loss 2.271 | ppl 4.83 | wps 221685 | wpb 22263 | bsz 1004 | num_updates 22000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.816 | nll_loss 2.271 | ppl 4.83 | wps 221685 | wpb 22263 | bsz 1004 | num_updates 22000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.816 | nll_loss 2.271 | ppl 4.83 | wps 221685 | wpb 22263 | bsz 1004 | num_updates 22000 | best_loss 3.803 epoch 013 | valid on 'valid' subset | loss 3.816 | nll_loss 2.271 | ppl 4.83 | wps 221685 | wpb 22263 | bsz 1004 | num_updates 22000 | best_loss 3.803 epoch 013: 1685 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=332755, ups=0.68, wpb=490888, bsz=16340.9, num_updates=22100, lr=0.000425436, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30072 epoch 013: 1685 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=332755, ups=0.68, wpb=490888, bsz=16340.9, num_updates=22100, lr=0.000425436, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30072 epoch 013: 1685 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=332755, ups=0.68, wpb=490888, bsz=16340.9, num_updates=22100, lr=0.000425436, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30072 epoch 013: 1685 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=332755, ups=0.68, wpb=490888, bsz=16340.9, num_updates=22100, lr=0.000425436, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30072 epoch 013: 1685 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=332755, ups=0.68, wpb=490888, bsz=16340.9, num_updates=22100, lr=0.000425436, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30072 epoch 013: 1685 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=332755, ups=0.68, wpb=490888, bsz=16340.9, num_updates=22100, lr=0.000425436, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30072 epoch 013: 1685 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=332755, ups=0.68, wpb=490888, bsz=16340.9, num_updates=22100, lr=0.000425436, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30072 epoch 013: 1685 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=332755, ups=0.68, wpb=490888, bsz=16340.9, num_updates=22100, lr=0.000425436, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30072 epoch 013: 1685 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=332755, ups=0.68, wpb=490888, bsz=16340.9, num_updates=22100, lr=0.000425436, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30072 epoch 013: 1685 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=332755, ups=0.68, wpb=490888, bsz=16340.9, num_updates=22100, lr=0.000425436, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30072 epoch 013: 1685 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=332755, ups=0.68, wpb=490888, bsz=16340.9, num_updates=22100, lr=0.000425436, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30072 epoch 013: 1685 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=332755, ups=0.68, wpb=490888, bsz=16340.9, num_updates=22100, lr=0.000425436, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30072 epoch 013: 1685 / 1707 loss=3.663, nll_loss=2.122, ppl=4.35, wps=332755, ups=0.68, wpb=490888, bsz=16340.9, num_updates=22100, lr=0.000425436, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30072 end of epoch 13 (average epoch stats below) epoch 013 | loss 3.66 | nll_loss 2.118 | ppl 4.34 | wps 360733 | ups 0.74 | wpb 489892 | bsz 16334.9 | num_updates 22122 | lr 0.000425224 | gnorm 0.261 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.3 | wall 30100 epoch 013 | loss 3.66 | nll_loss 2.118 | ppl 4.34 | wps 360733 | ups 0.74 | wpb 489892 | bsz 16334.9 | num_updates 22122 | lr 0.000425224 | gnorm 0.261 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.3 | wall 30100 epoch 013 | loss 3.66 | nll_loss 2.118 | ppl 4.34 | wps 360733 | ups 0.74 | wpb 489892 | bsz 16334.9 | num_updates 22122 | lr 0.000425224 | gnorm 0.261 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.3 | wall 30100 epoch 013 | loss 3.66 | nll_loss 2.118 | ppl 4.34 | wps 360733 | ups 0.74 | wpb 489892 | bsz 16334.9 | num_updates 22122 | lr 0.000425224 | gnorm 0.261 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.3 | wall 30100 epoch 013 | loss 3.66 | nll_loss 2.118 | ppl 4.34 | wps 360733 | ups 0.74 | wpb 489892 | bsz 16334.9 | num_updates 22122 | lr 0.000425224 | gnorm 0.261 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.3 | wall 30100 epoch 013 | loss 3.66 | nll_loss 2.118 | ppl 4.34 | wps 360733 | ups 0.74 | wpb 489892 | bsz 16334.9 | num_updates 22122 | lr 0.000425224 | gnorm 0.261 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.3 | wall 30100 epoch 013 | loss 3.66 | nll_loss 2.118 | ppl 4.34 | wps 360733 | ups 0.74 | wpb 489892 | bsz 16334.9 | num_updates 22122 | lr 0.000425224 | gnorm 0.261 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.3 | wall 30100 epoch 013 | loss 3.66 | nll_loss 2.118 | ppl 4.34 | wps 360733 | ups 0.74 | wpb 489892 | bsz 16334.9 | num_updates 22122 | lr 0.000425224 | gnorm 0.261 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.3 | wall 30100 epoch 013 | loss 3.66 | nll_loss 2.118 | ppl 4.34 | wps 360733 | ups 0.74 | wpb 489892 | bsz 16334.9 | num_updates 22122 | lr 0.000425224 | gnorm 0.261 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.3 | wall 30100 epoch 013 | loss 3.66 | nll_loss 2.118 | ppl 4.34 | wps 360733 | ups 0.74 | wpb 489892 | bsz 16334.9 | num_updates 22122 | lr 0.000425224 | gnorm 0.261 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.3 | wall 30100 epoch 013 | loss 3.66 | nll_loss 2.118 | ppl 4.34 | wps 360733 | ups 0.74 | wpb 489892 | bsz 16334.9 | num_updates 22122 | lr 0.000425224 | gnorm 0.261 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.3 | wall 30100 epoch 013 | loss 3.66 | nll_loss 2.118 | ppl 4.34 | wps 360733 | ups 0.74 | wpb 489892 | bsz 16334.9 | num_updates 22122 | lr 0.000425224 | gnorm 0.261 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.3 | wall 30100 epoch 013 | loss 3.66 | nll_loss 2.118 | ppl 4.34 | wps 360733 | ups 0.74 | wpb 489892 | bsz 16334.9 | num_updates 22122 | lr 0.000425224 | gnorm 0.261 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.3 | wall 30100 Start iterating over samples epoch 014: 78 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=365848, ups=0.75, wpb=486641, bsz=16212.8, num_updates=22200, lr=0.000424476, gnorm=0.262, clip=0, loss_scale=2, train_wall=132, gb_free=60.5, wall=30205 epoch 014: 78 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=365848, ups=0.75, wpb=486641, bsz=16212.8, num_updates=22200, lr=0.000424476, gnorm=0.262, clip=0, loss_scale=2, train_wall=132, gb_free=60.5, wall=30205 epoch 014: 78 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=365848, ups=0.75, wpb=486641, bsz=16212.8, num_updates=22200, lr=0.000424476, gnorm=0.262, clip=0, loss_scale=2, train_wall=132, gb_free=60.5, wall=30205 epoch 014: 78 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=365848, ups=0.75, wpb=486641, bsz=16212.8, num_updates=22200, lr=0.000424476, gnorm=0.262, clip=0, loss_scale=2, train_wall=132, gb_free=60.5, wall=30205 epoch 014: 78 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=365848, ups=0.75, wpb=486641, bsz=16212.8, num_updates=22200, lr=0.000424476, gnorm=0.262, clip=0, loss_scale=2, train_wall=132, gb_free=60.5, wall=30205 epoch 014: 78 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=365848, ups=0.75, wpb=486641, bsz=16212.8, num_updates=22200, lr=0.000424476, gnorm=0.262, clip=0, loss_scale=2, train_wall=132, gb_free=60.5, wall=30205 epoch 014: 78 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=365848, ups=0.75, wpb=486641, bsz=16212.8, num_updates=22200, lr=0.000424476, gnorm=0.262, clip=0, loss_scale=2, train_wall=132, gb_free=60.5, wall=30205 epoch 014: 78 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=365848, ups=0.75, wpb=486641, bsz=16212.8, num_updates=22200, lr=0.000424476, gnorm=0.262, clip=0, loss_scale=2, train_wall=132, gb_free=60.5, wall=30205 epoch 014: 78 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=365848, ups=0.75, wpb=486641, bsz=16212.8, num_updates=22200, lr=0.000424476, gnorm=0.262, clip=0, loss_scale=2, train_wall=132, gb_free=60.5, wall=30205 epoch 014: 78 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=365848, ups=0.75, wpb=486641, bsz=16212.8, num_updates=22200, lr=0.000424476, gnorm=0.262, clip=0, loss_scale=2, train_wall=132, gb_free=60.5, wall=30205 epoch 014: 78 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=365848, ups=0.75, wpb=486641, bsz=16212.8, num_updates=22200, lr=0.000424476, gnorm=0.262, clip=0, loss_scale=2, train_wall=132, gb_free=60.5, wall=30205 epoch 014: 78 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=365848, ups=0.75, wpb=486641, bsz=16212.8, num_updates=22200, lr=0.000424476, gnorm=0.262, clip=0, loss_scale=2, train_wall=132, gb_free=60.5, wall=30205 epoch 014: 78 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=365848, ups=0.75, wpb=486641, bsz=16212.8, num_updates=22200, lr=0.000424476, gnorm=0.262, clip=0, loss_scale=2, train_wall=132, gb_free=60.5, wall=30205 epoch 014: 78 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=365848, ups=0.75, wpb=486641, bsz=16212.8, num_updates=22200, lr=0.000424476, gnorm=0.262, clip=0, loss_scale=2, train_wall=132, gb_free=60.5, wall=30205 epoch 014: 178 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=363808, ups=0.74, wpb=489452, bsz=16355.8, num_updates=22300, lr=0.000423524, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=30339 epoch 014: 178 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=363808, ups=0.74, wpb=489452, bsz=16355.8, num_updates=22300, lr=0.000423524, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=30339 epoch 014: 178 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=363808, ups=0.74, wpb=489452, bsz=16355.8, num_updates=22300, lr=0.000423524, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=30339 epoch 014: 178 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=363808, ups=0.74, wpb=489452, bsz=16355.8, num_updates=22300, lr=0.000423524, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=30339 epoch 014: 178 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=363808, ups=0.74, wpb=489452, bsz=16355.8, num_updates=22300, lr=0.000423524, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=30339 epoch 014: 178 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=363808, ups=0.74, wpb=489452, bsz=16355.8, num_updates=22300, lr=0.000423524, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=30339 epoch 014: 178 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=363808, ups=0.74, wpb=489452, bsz=16355.8, num_updates=22300, lr=0.000423524, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=30339 epoch 014: 178 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=363808, ups=0.74, wpb=489452, bsz=16355.8, num_updates=22300, lr=0.000423524, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=30339 epoch 014: 178 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=363808, ups=0.74, wpb=489452, bsz=16355.8, num_updates=22300, lr=0.000423524, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=30339 epoch 014: 178 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=363808, ups=0.74, wpb=489452, bsz=16355.8, num_updates=22300, lr=0.000423524, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=30339 epoch 014: 178 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=363808, ups=0.74, wpb=489452, bsz=16355.8, num_updates=22300, lr=0.000423524, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=30339 epoch 014: 178 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=363808, ups=0.74, wpb=489452, bsz=16355.8, num_updates=22300, lr=0.000423524, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=30339 epoch 014: 178 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=363808, ups=0.74, wpb=489452, bsz=16355.8, num_updates=22300, lr=0.000423524, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=30339 epoch 014: 178 / 1707 loss=3.638, nll_loss=2.093, ppl=4.27, wps=363808, ups=0.74, wpb=489452, bsz=16355.8, num_updates=22300, lr=0.000423524, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=30339 epoch 014: 278 / 1707 loss=3.635, nll_loss=2.089, ppl=4.26, wps=367011, ups=0.75, wpb=490559, bsz=16236.5, num_updates=22400, lr=0.000422577, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=30473 epoch 014: 278 / 1707 loss=3.635, nll_loss=2.089, ppl=4.26, wps=367011, ups=0.75, wpb=490559, bsz=16236.5, num_updates=22400, lr=0.000422577, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=30473 epoch 014: 278 / 1707 loss=3.635, nll_loss=2.089, ppl=4.26, wps=367011, ups=0.75, wpb=490559, bsz=16236.5, num_updates=22400, lr=0.000422577, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=30473 epoch 014: 278 / 1707 loss=3.635, nll_loss=2.089, ppl=4.26, wps=367011, ups=0.75, wpb=490559, bsz=16236.5, num_updates=22400, lr=0.000422577, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=30473 epoch 014: 278 / 1707 loss=3.635, nll_loss=2.089, ppl=4.26, wps=367011, ups=0.75, wpb=490559, bsz=16236.5, num_updates=22400, lr=0.000422577, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=30473 epoch 014: 278 / 1707 loss=3.635, nll_loss=2.089, ppl=4.26, wps=367011, ups=0.75, wpb=490559, bsz=16236.5, num_updates=22400, lr=0.000422577, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=30473 epoch 014: 278 / 1707 loss=3.635, nll_loss=2.089, ppl=4.26, wps=367011, ups=0.75, wpb=490559, bsz=16236.5, num_updates=22400, lr=0.000422577, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=30473 epoch 014: 278 / 1707 loss=3.635, nll_loss=2.089, ppl=4.26, wps=367011, ups=0.75, wpb=490559, bsz=16236.5, num_updates=22400, lr=0.000422577, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=30473 epoch 014: 278 / 1707 loss=3.635, nll_loss=2.089, ppl=4.26, wps=367011, ups=0.75, wpb=490559, bsz=16236.5, num_updates=22400, lr=0.000422577, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=30473 epoch 014: 278 / 1707 loss=3.635, nll_loss=2.089, ppl=4.26, wps=367011, ups=0.75, wpb=490559, bsz=16236.5, num_updates=22400, lr=0.000422577, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=30473 epoch 014: 278 / 1707 loss=3.635, nll_loss=2.089, ppl=4.26, wps=367011, ups=0.75, wpb=490559, bsz=16236.5, num_updates=22400, lr=0.000422577, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=30473 epoch 014: 278 / 1707 loss=3.635, nll_loss=2.089, ppl=4.26, wps=367011, ups=0.75, wpb=490559, bsz=16236.5, num_updates=22400, lr=0.000422577, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=30473 epoch 014: 278 / 1707 loss=3.635, nll_loss=2.089, ppl=4.26, wps=367011, ups=0.75, wpb=490559, bsz=16236.5, num_updates=22400, lr=0.000422577, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=30473 epoch 014: 278 / 1707 loss=3.635, nll_loss=2.089, ppl=4.26, wps=367011, ups=0.75, wpb=490559, bsz=16236.5, num_updates=22400, lr=0.000422577, gnorm=0.263, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=30473 epoch 014: 379 / 1707 loss=3.643, nll_loss=2.099, ppl=4.28, wps=362826, ups=0.74, wpb=490501, bsz=16425.4, num_updates=22500, lr=0.000421637, gnorm=0.258, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30608 epoch 014: 379 / 1707 loss=3.643, nll_loss=2.099, ppl=4.28, wps=362826, ups=0.74, wpb=490501, bsz=16425.4, num_updates=22500, lr=0.000421637, gnorm=0.258, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30608 epoch 014: 379 / 1707 loss=3.643, nll_loss=2.099, ppl=4.28, wps=362826, ups=0.74, wpb=490501, bsz=16425.4, num_updates=22500, lr=0.000421637, gnorm=0.258, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30608 epoch 014: 379 / 1707 loss=3.643, nll_loss=2.099, ppl=4.28, wps=362826, ups=0.74, wpb=490501, bsz=16425.4, num_updates=22500, lr=0.000421637, gnorm=0.258, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30608 epoch 014: 379 / 1707 loss=3.643, nll_loss=2.099, ppl=4.28, wps=362826, ups=0.74, wpb=490501, bsz=16425.4, num_updates=22500, lr=0.000421637, gnorm=0.258, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30608 epoch 014: 379 / 1707 loss=3.643, nll_loss=2.099, ppl=4.28, wps=362826, ups=0.74, wpb=490501, bsz=16425.4, num_updates=22500, lr=0.000421637, gnorm=0.258, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30608 epoch 014: 379 / 1707 loss=3.643, nll_loss=2.099, ppl=4.28, wps=362826, ups=0.74, wpb=490501, bsz=16425.4, num_updates=22500, lr=0.000421637, gnorm=0.258, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30608 epoch 014: 379 / 1707 loss=3.643, nll_loss=2.099, ppl=4.28, wps=362826, ups=0.74, wpb=490501, bsz=16425.4, num_updates=22500, lr=0.000421637, gnorm=0.258, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30608 epoch 014: 379 / 1707 loss=3.643, nll_loss=2.099, ppl=4.28, wps=362826, ups=0.74, wpb=490501, bsz=16425.4, num_updates=22500, lr=0.000421637, gnorm=0.258, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30608 epoch 014: 379 / 1707 loss=3.643, nll_loss=2.099, ppl=4.28, wps=362826, ups=0.74, wpb=490501, bsz=16425.4, num_updates=22500, lr=0.000421637, gnorm=0.258, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30608 epoch 014: 379 / 1707 loss=3.643, nll_loss=2.099, ppl=4.28, wps=362826, ups=0.74, wpb=490501, bsz=16425.4, num_updates=22500, lr=0.000421637, gnorm=0.258, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30608 epoch 014: 379 / 1707 loss=3.643, nll_loss=2.099, ppl=4.28, wps=362826, ups=0.74, wpb=490501, bsz=16425.4, num_updates=22500, lr=0.000421637, gnorm=0.258, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30608 epoch 014: 379 / 1707 loss=3.643, nll_loss=2.099, ppl=4.28, wps=362826, ups=0.74, wpb=490501, bsz=16425.4, num_updates=22500, lr=0.000421637, gnorm=0.258, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30608 epoch 014: 379 / 1707 loss=3.643, nll_loss=2.099, ppl=4.28, wps=362826, ups=0.74, wpb=490501, bsz=16425.4, num_updates=22500, lr=0.000421637, gnorm=0.258, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=30608 epoch 014: 480 / 1707 loss=3.641, nll_loss=2.097, ppl=4.28, wps=363839, ups=0.74, wpb=489902, bsz=16152.2, num_updates=22600, lr=0.000420703, gnorm=0.262, clip=0, loss_scale=1, train_wall=134, gb_free=60.9, wall=30743 epoch 014: 480 / 1707 loss=3.641, nll_loss=2.097, ppl=4.28, wps=363839, ups=0.74, wpb=489902, bsz=16152.2, num_updates=22600, lr=0.000420703, gnorm=0.262, clip=0, loss_scale=1, train_wall=134, gb_free=60.9, wall=30743 epoch 014: 480 / 1707 loss=3.641, nll_loss=2.097, ppl=4.28, wps=363839, ups=0.74, wpb=489902, bsz=16152.2, num_updates=22600, lr=0.000420703, gnorm=0.262, clip=0, loss_scale=1, train_wall=134, gb_free=60.9, wall=30743 epoch 014: 480 / 1707 loss=3.641, nll_loss=2.097, ppl=4.28, wps=363839, ups=0.74, wpb=489902, bsz=16152.2, num_updates=22600, lr=0.000420703, gnorm=0.262, clip=0, loss_scale=1, train_wall=134, gb_free=60.9, wall=30743 epoch 014: 480 / 1707 loss=3.641, nll_loss=2.097, ppl=4.28, wps=363839, ups=0.74, wpb=489902, bsz=16152.2, num_updates=22600, lr=0.000420703, gnorm=0.262, clip=0, loss_scale=1, train_wall=134, gb_free=60.9, wall=30743 epoch 014: 480 / 1707 loss=3.641, nll_loss=2.097, ppl=4.28, wps=363839, ups=0.74, wpb=489902, bsz=16152.2, num_updates=22600, lr=0.000420703, gnorm=0.262, clip=0, loss_scale=1, train_wall=134, gb_free=60.9, wall=30743 epoch 014: 480 / 1707 loss=3.641, nll_loss=2.097, ppl=4.28, wps=363839, ups=0.74, wpb=489902, bsz=16152.2, num_updates=22600, lr=0.000420703, gnorm=0.262, clip=0, loss_scale=1, train_wall=134, gb_free=60.9, wall=30743 epoch 014: 480 / 1707 loss=3.641, nll_loss=2.097, ppl=4.28, wps=363839, ups=0.74, wpb=489902, bsz=16152.2, num_updates=22600, lr=0.000420703, gnorm=0.262, clip=0, loss_scale=1, train_wall=134, gb_free=60.9, wall=30743 epoch 014: 480 / 1707 loss=3.641, nll_loss=2.097, ppl=4.28, wps=363839, ups=0.74, wpb=489902, bsz=16152.2, num_updates=22600, lr=0.000420703, gnorm=0.262, clip=0, loss_scale=1, train_wall=134, gb_free=60.9, wall=30743 epoch 014: 480 / 1707 loss=3.641, nll_loss=2.097, ppl=4.28, wps=363839, ups=0.74, wpb=489902, bsz=16152.2, num_updates=22600, lr=0.000420703, gnorm=0.262, clip=0, loss_scale=1, train_wall=134, gb_free=60.9, wall=30743 epoch 014: 480 / 1707 loss=3.641, nll_loss=2.097, ppl=4.28, wps=363839, ups=0.74, wpb=489902, bsz=16152.2, num_updates=22600, lr=0.000420703, gnorm=0.262, clip=0, loss_scale=1, train_wall=134, gb_free=60.9, wall=30743 epoch 014: 480 / 1707 loss=3.641, nll_loss=2.097, ppl=4.28, wps=363839, ups=0.74, wpb=489902, bsz=16152.2, num_updates=22600, lr=0.000420703, gnorm=0.262, clip=0, loss_scale=1, train_wall=134, gb_free=60.9, wall=30743 epoch 014: 480 / 1707 loss=3.641, nll_loss=2.097, ppl=4.28, wps=363839, ups=0.74, wpb=489902, bsz=16152.2, num_updates=22600, lr=0.000420703, gnorm=0.262, clip=0, loss_scale=1, train_wall=134, gb_free=60.9, wall=30743 epoch 014: 480 / 1707 loss=3.641, nll_loss=2.097, ppl=4.28, wps=363839, ups=0.74, wpb=489902, bsz=16152.2, num_updates=22600, lr=0.000420703, gnorm=0.262, clip=0, loss_scale=1, train_wall=134, gb_free=60.9, wall=30743 epoch 014: 580 / 1707 loss=3.644, nll_loss=2.1, ppl=4.29, wps=366254, ups=0.75, wpb=490460, bsz=16269.8, num_updates=22700, lr=0.000419775, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=30877 epoch 014: 580 / 1707 loss=3.644, nll_loss=2.1, ppl=4.29, wps=366254, ups=0.75, wpb=490460, bsz=16269.8, num_updates=22700, lr=0.000419775, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=30877 epoch 014: 580 / 1707 loss=3.644, nll_loss=2.1, ppl=4.29, wps=366254, ups=0.75, wpb=490460, bsz=16269.8, num_updates=22700, lr=0.000419775, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=30877 epoch 014: 580 / 1707 loss=3.644, nll_loss=2.1, ppl=4.29, wps=366254, ups=0.75, wpb=490460, bsz=16269.8, num_updates=22700, lr=0.000419775, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=30877 epoch 014: 580 / 1707 loss=3.644, nll_loss=2.1, ppl=4.29, wps=366254, ups=0.75, wpb=490460, bsz=16269.8, num_updates=22700, lr=0.000419775, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=30877 epoch 014: 580 / 1707 loss=3.644, nll_loss=2.1, ppl=4.29, wps=366254, ups=0.75, wpb=490460, bsz=16269.8, num_updates=22700, lr=0.000419775, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=30877 epoch 014: 580 / 1707 loss=3.644, nll_loss=2.1, ppl=4.29, wps=366254, ups=0.75, wpb=490460, bsz=16269.8, num_updates=22700, lr=0.000419775, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=30877 epoch 014: 580 / 1707 loss=3.644, nll_loss=2.1, ppl=4.29, wps=366254, ups=0.75, wpb=490460, bsz=16269.8, num_updates=22700, lr=0.000419775, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=30877 epoch 014: 580 / 1707 loss=3.644, nll_loss=2.1, ppl=4.29, wps=366254, ups=0.75, wpb=490460, bsz=16269.8, num_updates=22700, lr=0.000419775, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=30877 epoch 014: 580 / 1707 loss=3.644, nll_loss=2.1, ppl=4.29, wps=366254, ups=0.75, wpb=490460, bsz=16269.8, num_updates=22700, lr=0.000419775, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=30877 epoch 014: 580 / 1707 loss=3.644, nll_loss=2.1, ppl=4.29, wps=366254, ups=0.75, wpb=490460, bsz=16269.8, num_updates=22700, lr=0.000419775, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=30877 epoch 014: 580 / 1707 loss=3.644, nll_loss=2.1, ppl=4.29, wps=366254, ups=0.75, wpb=490460, bsz=16269.8, num_updates=22700, lr=0.000419775, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=30877 epoch 014: 580 / 1707 loss=3.644, nll_loss=2.1, ppl=4.29, wps=366254, ups=0.75, wpb=490460, bsz=16269.8, num_updates=22700, lr=0.000419775, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=30877 epoch 014: 580 / 1707 loss=3.644, nll_loss=2.1, ppl=4.29, wps=366254, ups=0.75, wpb=490460, bsz=16269.8, num_updates=22700, lr=0.000419775, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=30877 epoch 014: 680 / 1707 loss=3.645, nll_loss=2.102, ppl=4.29, wps=367424, ups=0.75, wpb=489903, bsz=16311.1, num_updates=22800, lr=0.000418854, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=31010 epoch 014: 680 / 1707 loss=3.645, nll_loss=2.102, ppl=4.29, wps=367424, ups=0.75, wpb=489903, bsz=16311.1, num_updates=22800, lr=0.000418854, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=31010 epoch 014: 680 / 1707 loss=3.645, nll_loss=2.102, ppl=4.29, wps=367424, ups=0.75, wpb=489903, bsz=16311.1, num_updates=22800, lr=0.000418854, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=31010 epoch 014: 680 / 1707 loss=3.645, nll_loss=2.102, ppl=4.29, wps=367424, ups=0.75, wpb=489903, bsz=16311.1, num_updates=22800, lr=0.000418854, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=31010 epoch 014: 680 / 1707 loss=3.645, nll_loss=2.102, ppl=4.29, wps=367424, ups=0.75, wpb=489903, bsz=16311.1, num_updates=22800, lr=0.000418854, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=31010 epoch 014: 680 / 1707 loss=3.645, nll_loss=2.102, ppl=4.29, wps=367424, ups=0.75, wpb=489903, bsz=16311.1, num_updates=22800, lr=0.000418854, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=31010 epoch 014: 680 / 1707 loss=3.645, nll_loss=2.102, ppl=4.29, wps=367424, ups=0.75, wpb=489903, bsz=16311.1, num_updates=22800, lr=0.000418854, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=31010 epoch 014: 680 / 1707 loss=3.645, nll_loss=2.102, ppl=4.29, wps=367424, ups=0.75, wpb=489903, bsz=16311.1, num_updates=22800, lr=0.000418854, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=31010 epoch 014: 680 / 1707 loss=3.645, nll_loss=2.102, ppl=4.29, wps=367424, ups=0.75, wpb=489903, bsz=16311.1, num_updates=22800, lr=0.000418854, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=31010 epoch 014: 680 / 1707 loss=3.645, nll_loss=2.102, ppl=4.29, wps=367424, ups=0.75, wpb=489903, bsz=16311.1, num_updates=22800, lr=0.000418854, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=31010 epoch 014: 680 / 1707 loss=3.645, nll_loss=2.102, ppl=4.29, wps=367424, ups=0.75, wpb=489903, bsz=16311.1, num_updates=22800, lr=0.000418854, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=31010 epoch 014: 680 / 1707 loss=3.645, nll_loss=2.102, ppl=4.29, wps=367424, ups=0.75, wpb=489903, bsz=16311.1, num_updates=22800, lr=0.000418854, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=31010 epoch 014: 680 / 1707 loss=3.645, nll_loss=2.102, ppl=4.29, wps=367424, ups=0.75, wpb=489903, bsz=16311.1, num_updates=22800, lr=0.000418854, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=31010 epoch 014: 680 / 1707 loss=3.645, nll_loss=2.102, ppl=4.29, wps=367424, ups=0.75, wpb=489903, bsz=16311.1, num_updates=22800, lr=0.000418854, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=31010 epoch 014: 780 / 1707 loss=3.649, nll_loss=2.106, ppl=4.3, wps=365648, ups=0.75, wpb=489935, bsz=16428.6, num_updates=22900, lr=0.000417938, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=31144 epoch 014: 780 / 1707 loss=3.649, nll_loss=2.106, ppl=4.3, wps=365648, ups=0.75, wpb=489935, bsz=16428.6, num_updates=22900, lr=0.000417938, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=31144 epoch 014: 780 / 1707 loss=3.649, nll_loss=2.106, ppl=4.3, wps=365648, ups=0.75, wpb=489935, bsz=16428.6, num_updates=22900, lr=0.000417938, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=31144 epoch 014: 780 / 1707 loss=3.649, nll_loss=2.106, ppl=4.3, wps=365648, ups=0.75, wpb=489935, bsz=16428.6, num_updates=22900, lr=0.000417938, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=31144 epoch 014: 780 / 1707 loss=3.649, nll_loss=2.106, ppl=4.3, wps=365648, ups=0.75, wpb=489935, bsz=16428.6, num_updates=22900, lr=0.000417938, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=31144 epoch 014: 780 / 1707 loss=3.649, nll_loss=2.106, ppl=4.3, wps=365648, ups=0.75, wpb=489935, bsz=16428.6, num_updates=22900, lr=0.000417938, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=31144 epoch 014: 780 / 1707 loss=3.649, nll_loss=2.106, ppl=4.3, wps=365648, ups=0.75, wpb=489935, bsz=16428.6, num_updates=22900, lr=0.000417938, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=31144 epoch 014: 780 / 1707 loss=3.649, nll_loss=2.106, ppl=4.3, wps=365648, ups=0.75, wpb=489935, bsz=16428.6, num_updates=22900, lr=0.000417938, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=31144 epoch 014: 780 / 1707 loss=3.649, nll_loss=2.106, ppl=4.3, wps=365648, ups=0.75, wpb=489935, bsz=16428.6, num_updates=22900, lr=0.000417938, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=31144 epoch 014: 780 / 1707 loss=3.649, nll_loss=2.106, ppl=4.3, wps=365648, ups=0.75, wpb=489935, bsz=16428.6, num_updates=22900, lr=0.000417938, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=31144 epoch 014: 780 / 1707 loss=3.649, nll_loss=2.106, ppl=4.3, wps=365648, ups=0.75, wpb=489935, bsz=16428.6, num_updates=22900, lr=0.000417938, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=31144 epoch 014: 780 / 1707 loss=3.649, nll_loss=2.106, ppl=4.3, wps=365648, ups=0.75, wpb=489935, bsz=16428.6, num_updates=22900, lr=0.000417938, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=31144 epoch 014: 780 / 1707 loss=3.649, nll_loss=2.106, ppl=4.3, wps=365648, ups=0.75, wpb=489935, bsz=16428.6, num_updates=22900, lr=0.000417938, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=31144 epoch 014: 780 / 1707 loss=3.649, nll_loss=2.106, ppl=4.3, wps=365648, ups=0.75, wpb=489935, bsz=16428.6, num_updates=22900, lr=0.000417938, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=31144 epoch 014: 880 / 1707 loss=3.648, nll_loss=2.105, ppl=4.3, wps=366041, ups=0.75, wpb=490686, bsz=16546.5, num_updates=23000, lr=0.000417029, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=31278 epoch 014: 880 / 1707 loss=3.648, nll_loss=2.105, ppl=4.3, wps=366041, ups=0.75, wpb=490686, bsz=16546.5, num_updates=23000, lr=0.000417029, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=31278 epoch 014: 880 / 1707 loss=3.648, nll_loss=2.105, ppl=4.3, wps=366041, ups=0.75, wpb=490686, bsz=16546.5, num_updates=23000, lr=0.000417029, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=31278 epoch 014: 880 / 1707 loss=3.648, nll_loss=2.105, ppl=4.3, wps=366041, ups=0.75, wpb=490686, bsz=16546.5, num_updates=23000, lr=0.000417029, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=31278 epoch 014: 880 / 1707 loss=3.648, nll_loss=2.105, ppl=4.3, wps=366041, ups=0.75, wpb=490686, bsz=16546.5, num_updates=23000, lr=0.000417029, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=31278 epoch 014: 880 / 1707 loss=3.648, nll_loss=2.105, ppl=4.3, wps=366041, ups=0.75, wpb=490686, bsz=16546.5, num_updates=23000, lr=0.000417029, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=31278 epoch 014: 880 / 1707 loss=3.648, nll_loss=2.105, ppl=4.3, wps=366041, ups=0.75, wpb=490686, bsz=16546.5, num_updates=23000, lr=0.000417029, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=31278 epoch 014: 880 / 1707 loss=3.648, nll_loss=2.105, ppl=4.3, wps=366041, ups=0.75, wpb=490686, bsz=16546.5, num_updates=23000, lr=0.000417029, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=31278 epoch 014: 880 / 1707 loss=3.648, nll_loss=2.105, ppl=4.3, wps=366041, ups=0.75, wpb=490686, bsz=16546.5, num_updates=23000, lr=0.000417029, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=31278 epoch 014: 880 / 1707 loss=3.648, nll_loss=2.105, ppl=4.3, wps=366041, ups=0.75, wpb=490686, bsz=16546.5, num_updates=23000, lr=0.000417029, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=31278 epoch 014: 880 / 1707 loss=3.648, nll_loss=2.105, ppl=4.3, wps=366041, ups=0.75, wpb=490686, bsz=16546.5, num_updates=23000, lr=0.000417029, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=31278 epoch 014: 880 / 1707 loss=3.648, nll_loss=2.105, ppl=4.3, wps=366041, ups=0.75, wpb=490686, bsz=16546.5, num_updates=23000, lr=0.000417029, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=31278 epoch 014: 880 / 1707 loss=3.648, nll_loss=2.105, ppl=4.3, wps=366041, ups=0.75, wpb=490686, bsz=16546.5, num_updates=23000, lr=0.000417029, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=31278 epoch 014: 880 / 1707 loss=3.648, nll_loss=2.105, ppl=4.3, wps=366041, ups=0.75, wpb=490686, bsz=16546.5, num_updates=23000, lr=0.000417029, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=31278 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 3.804 | nll_loss 2.256 | ppl 4.78 | wps 217354 | wpb 22263 | bsz 1004 | num_updates 23000 | best_loss 3.803 epoch 014 | valid on 'valid' subset | loss 3.804 | nll_loss 2.256 | ppl 4.78 | wps 217354 | wpb 22263 | bsz 1004 | num_updates 23000 | best_loss 3.803 epoch 014 | valid on 'valid' subset | loss 3.804 | nll_loss 2.256 | ppl 4.78 | wps 217354 | wpb 22263 | bsz 1004 | num_updates 23000 | best_loss 3.803 epoch 014 | valid on 'valid' subset | loss 3.804 | nll_loss 2.256 | ppl 4.78 | wps 217354 | wpb 22263 | bsz 1004 | num_updates 23000 | best_loss 3.803 epoch 014 | valid on 'valid' subset | loss 3.804 | nll_loss 2.256 | ppl 4.78 | wps 217354 | wpb 22263 | bsz 1004 | num_updates 23000 | best_loss 3.803 epoch 014 | valid on 'valid' subset | loss 3.804 | nll_loss 2.256 | ppl 4.78 | wps 217354 | wpb 22263 | bsz 1004 | num_updates 23000 | best_loss 3.803 epoch 014 | valid on 'valid' subset | loss 3.804 | nll_loss 2.256 | ppl 4.78 | wps 217354 | wpb 22263 | bsz 1004 | num_updates 23000 | best_loss 3.803 epoch 014 | valid on 'valid' subset | loss 3.804 | nll_loss 2.256 | ppl 4.78 | wps 217354 | wpb 22263 | bsz 1004 | num_updates 23000 | best_loss 3.803 epoch 014 | valid on 'valid' subset | loss 3.804 | nll_loss 2.256 | ppl 4.78 | wps 217354 | wpb 22263 | bsz 1004 | num_updates 23000 | best_loss 3.803 epoch 014 | valid on 'valid' subset | loss 3.804 | nll_loss 2.256 | ppl 4.78 | wps 217354 | wpb 22263 | bsz 1004 | num_updates 23000 | best_loss 3.803 epoch 014 | valid on 'valid' subset | loss 3.804 | nll_loss 2.256 | ppl 4.78 | wps 217354 | wpb 22263 | bsz 1004 | num_updates 23000 | best_loss 3.803 epoch 014 | valid on 'valid' subset | loss 3.804 | nll_loss 2.256 | ppl 4.78 | wps 217354 | wpb 22263 | bsz 1004 | num_updates 23000 | best_loss 3.803 epoch 014 | valid on 'valid' subset | loss 3.804 | nll_loss 2.256 | ppl 4.78 | wps 217354 | wpb 22263 | bsz 1004 | num_updates 23000 | best_loss 3.803 epoch 014 | valid on 'valid' subset | loss 3.804 | nll_loss 2.256 | ppl 4.78 | wps 217354 | wpb 22263 | bsz 1004 | num_updates 23000 | best_loss 3.803 epoch 014: 981 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=334757, ups=0.68, wpb=490435, bsz=16410.8, num_updates=23100, lr=0.000416125, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=61.3, wall=31424 epoch 014: 981 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=334757, ups=0.68, wpb=490435, bsz=16410.8, num_updates=23100, lr=0.000416125, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=61.3, wall=31424 epoch 014: 981 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=334757, ups=0.68, wpb=490435, bsz=16410.8, num_updates=23100, lr=0.000416125, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=61.3, wall=31424 epoch 014: 981 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=334757, ups=0.68, wpb=490435, bsz=16410.8, num_updates=23100, lr=0.000416125, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=61.3, wall=31424 epoch 014: 981 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=334757, ups=0.68, wpb=490435, bsz=16410.8, num_updates=23100, lr=0.000416125, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=61.3, wall=31424 epoch 014: 981 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=334757, ups=0.68, wpb=490435, bsz=16410.8, num_updates=23100, lr=0.000416125, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=61.3, wall=31424 epoch 014: 981 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=334757, ups=0.68, wpb=490435, bsz=16410.8, num_updates=23100, lr=0.000416125, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=61.3, wall=31424 epoch 014: 981 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=334757, ups=0.68, wpb=490435, bsz=16410.8, num_updates=23100, lr=0.000416125, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=61.3, wall=31424 epoch 014: 981 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=334757, ups=0.68, wpb=490435, bsz=16410.8, num_updates=23100, lr=0.000416125, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=61.3, wall=31424 epoch 014: 981 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=334757, ups=0.68, wpb=490435, bsz=16410.8, num_updates=23100, lr=0.000416125, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=61.3, wall=31424 epoch 014: 981 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=334757, ups=0.68, wpb=490435, bsz=16410.8, num_updates=23100, lr=0.000416125, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=61.3, wall=31424 epoch 014: 981 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=334757, ups=0.68, wpb=490435, bsz=16410.8, num_updates=23100, lr=0.000416125, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=61.3, wall=31424 epoch 014: 981 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=334757, ups=0.68, wpb=490435, bsz=16410.8, num_updates=23100, lr=0.000416125, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=61.3, wall=31424 epoch 014: 981 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=334757, ups=0.68, wpb=490435, bsz=16410.8, num_updates=23100, lr=0.000416125, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=61.3, wall=31424 epoch 014: 1081 / 1707 loss=3.65, nll_loss=2.108, ppl=4.31, wps=366541, ups=0.75, wpb=489556, bsz=16222.2, num_updates=23200, lr=0.000415227, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=31558 epoch 014: 1081 / 1707 loss=3.65, nll_loss=2.108, ppl=4.31, wps=366541, ups=0.75, wpb=489556, bsz=16222.2, num_updates=23200, lr=0.000415227, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=31558 epoch 014: 1081 / 1707 loss=3.65, nll_loss=2.108, ppl=4.31, wps=366541, ups=0.75, wpb=489556, bsz=16222.2, num_updates=23200, lr=0.000415227, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=31558 epoch 014: 1081 / 1707 loss=3.65, nll_loss=2.108, ppl=4.31, wps=366541, ups=0.75, wpb=489556, bsz=16222.2, num_updates=23200, lr=0.000415227, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=31558 epoch 014: 1081 / 1707 loss=3.65, nll_loss=2.108, ppl=4.31, wps=366541, ups=0.75, wpb=489556, bsz=16222.2, num_updates=23200, lr=0.000415227, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=31558 epoch 014: 1081 / 1707 loss=3.65, nll_loss=2.108, ppl=4.31, wps=366541, ups=0.75, wpb=489556, bsz=16222.2, num_updates=23200, lr=0.000415227, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=31558 epoch 014: 1081 / 1707 loss=3.65, nll_loss=2.108, ppl=4.31, wps=366541, ups=0.75, wpb=489556, bsz=16222.2, num_updates=23200, lr=0.000415227, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=31558 epoch 014: 1081 / 1707 loss=3.65, nll_loss=2.108, ppl=4.31, wps=366541, ups=0.75, wpb=489556, bsz=16222.2, num_updates=23200, lr=0.000415227, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=31558 epoch 014: 1081 / 1707 loss=3.65, nll_loss=2.108, ppl=4.31, wps=366541, ups=0.75, wpb=489556, bsz=16222.2, num_updates=23200, lr=0.000415227, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=31558 epoch 014: 1081 / 1707 loss=3.65, nll_loss=2.108, ppl=4.31, wps=366541, ups=0.75, wpb=489556, bsz=16222.2, num_updates=23200, lr=0.000415227, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=31558 epoch 014: 1081 / 1707 loss=3.65, nll_loss=2.108, ppl=4.31, wps=366541, ups=0.75, wpb=489556, bsz=16222.2, num_updates=23200, lr=0.000415227, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=31558 epoch 014: 1081 / 1707 loss=3.65, nll_loss=2.108, ppl=4.31, wps=366541, ups=0.75, wpb=489556, bsz=16222.2, num_updates=23200, lr=0.000415227, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=31558 epoch 014: 1081 / 1707 loss=3.65, nll_loss=2.108, ppl=4.31, wps=366541, ups=0.75, wpb=489556, bsz=16222.2, num_updates=23200, lr=0.000415227, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=31558 epoch 014: 1081 / 1707 loss=3.65, nll_loss=2.108, ppl=4.31, wps=366541, ups=0.75, wpb=489556, bsz=16222.2, num_updates=23200, lr=0.000415227, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=31558 epoch 014: 1181 / 1707 loss=3.651, nll_loss=2.109, ppl=4.31, wps=366044, ups=0.75, wpb=490569, bsz=16639.3, num_updates=23300, lr=0.000414335, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=31692 epoch 014: 1181 / 1707 loss=3.651, nll_loss=2.109, ppl=4.31, wps=366044, ups=0.75, wpb=490569, bsz=16639.3, num_updates=23300, lr=0.000414335, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=31692 epoch 014: 1181 / 1707 loss=3.651, nll_loss=2.109, ppl=4.31, wps=366044, ups=0.75, wpb=490569, bsz=16639.3, num_updates=23300, lr=0.000414335, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=31692 epoch 014: 1181 / 1707 loss=3.651, nll_loss=2.109, ppl=4.31, wps=366044, ups=0.75, wpb=490569, bsz=16639.3, num_updates=23300, lr=0.000414335, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=31692 epoch 014: 1181 / 1707 loss=3.651, nll_loss=2.109, ppl=4.31, wps=366044, ups=0.75, wpb=490569, bsz=16639.3, num_updates=23300, lr=0.000414335, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=31692 epoch 014: 1181 / 1707 loss=3.651, nll_loss=2.109, ppl=4.31, wps=366044, ups=0.75, wpb=490569, bsz=16639.3, num_updates=23300, lr=0.000414335, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=31692 epoch 014: 1181 / 1707 loss=3.651, nll_loss=2.109, ppl=4.31, wps=366044, ups=0.75, wpb=490569, bsz=16639.3, num_updates=23300, lr=0.000414335, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=31692 epoch 014: 1181 / 1707 loss=3.651, nll_loss=2.109, ppl=4.31, wps=366044, ups=0.75, wpb=490569, bsz=16639.3, num_updates=23300, lr=0.000414335, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=31692 epoch 014: 1181 / 1707 loss=3.651, nll_loss=2.109, ppl=4.31, wps=366044, ups=0.75, wpb=490569, bsz=16639.3, num_updates=23300, lr=0.000414335, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=31692 epoch 014: 1181 / 1707 loss=3.651, nll_loss=2.109, ppl=4.31, wps=366044, ups=0.75, wpb=490569, bsz=16639.3, num_updates=23300, lr=0.000414335, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=31692 epoch 014: 1181 / 1707 loss=3.651, nll_loss=2.109, ppl=4.31, wps=366044, ups=0.75, wpb=490569, bsz=16639.3, num_updates=23300, lr=0.000414335, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=31692 epoch 014: 1181 / 1707 loss=3.651, nll_loss=2.109, ppl=4.31, wps=366044, ups=0.75, wpb=490569, bsz=16639.3, num_updates=23300, lr=0.000414335, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=31692 epoch 014: 1181 / 1707 loss=3.651, nll_loss=2.109, ppl=4.31, wps=366044, ups=0.75, wpb=490569, bsz=16639.3, num_updates=23300, lr=0.000414335, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=31692 epoch 014: 1181 / 1707 loss=3.651, nll_loss=2.109, ppl=4.31, wps=366044, ups=0.75, wpb=490569, bsz=16639.3, num_updates=23300, lr=0.000414335, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=31692 epoch 014: 1281 / 1707 loss=3.647, nll_loss=2.104, ppl=4.3, wps=366723, ups=0.75, wpb=489979, bsz=16189.4, num_updates=23400, lr=0.000413449, gnorm=0.256, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=31826 epoch 014: 1281 / 1707 loss=3.647, nll_loss=2.104, ppl=4.3, wps=366723, ups=0.75, wpb=489979, bsz=16189.4, num_updates=23400, lr=0.000413449, gnorm=0.256, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=31826 epoch 014: 1281 / 1707 loss=3.647, nll_loss=2.104, ppl=4.3, wps=366723, ups=0.75, wpb=489979, bsz=16189.4, num_updates=23400, lr=0.000413449, gnorm=0.256, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=31826 epoch 014: 1281 / 1707 loss=3.647, nll_loss=2.104, ppl=4.3, wps=366723, ups=0.75, wpb=489979, bsz=16189.4, num_updates=23400, lr=0.000413449, gnorm=0.256, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=31826 epoch 014: 1281 / 1707 loss=3.647, nll_loss=2.104, ppl=4.3, wps=366723, ups=0.75, wpb=489979, bsz=16189.4, num_updates=23400, lr=0.000413449, gnorm=0.256, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=31826 epoch 014: 1281 / 1707 loss=3.647, nll_loss=2.104, ppl=4.3, wps=366723, ups=0.75, wpb=489979, bsz=16189.4, num_updates=23400, lr=0.000413449, gnorm=0.256, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=31826 epoch 014: 1281 / 1707 loss=3.647, nll_loss=2.104, ppl=4.3, wps=366723, ups=0.75, wpb=489979, bsz=16189.4, num_updates=23400, lr=0.000413449, gnorm=0.256, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=31826 epoch 014: 1281 / 1707 loss=3.647, nll_loss=2.104, ppl=4.3, wps=366723, ups=0.75, wpb=489979, bsz=16189.4, num_updates=23400, lr=0.000413449, gnorm=0.256, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=31826 epoch 014: 1281 / 1707 loss=3.647, nll_loss=2.104, ppl=4.3, wps=366723, ups=0.75, wpb=489979, bsz=16189.4, num_updates=23400, lr=0.000413449, gnorm=0.256, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=31826 epoch 014: 1281 / 1707 loss=3.647, nll_loss=2.104, ppl=4.3, wps=366723, ups=0.75, wpb=489979, bsz=16189.4, num_updates=23400, lr=0.000413449, gnorm=0.256, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=31826 epoch 014: 1281 / 1707 loss=3.647, nll_loss=2.104, ppl=4.3, wps=366723, ups=0.75, wpb=489979, bsz=16189.4, num_updates=23400, lr=0.000413449, gnorm=0.256, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=31826 epoch 014: 1281 / 1707 loss=3.647, nll_loss=2.104, ppl=4.3, wps=366723, ups=0.75, wpb=489979, bsz=16189.4, num_updates=23400, lr=0.000413449, gnorm=0.256, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=31826 epoch 014: 1281 / 1707 loss=3.647, nll_loss=2.104, ppl=4.3, wps=366723, ups=0.75, wpb=489979, bsz=16189.4, num_updates=23400, lr=0.000413449, gnorm=0.256, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=31826 epoch 014: 1281 / 1707 loss=3.647, nll_loss=2.104, ppl=4.3, wps=366723, ups=0.75, wpb=489979, bsz=16189.4, num_updates=23400, lr=0.000413449, gnorm=0.256, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=31826 epoch 014: 1382 / 1707 loss=3.652, nll_loss=2.11, ppl=4.32, wps=362164, ups=0.74, wpb=489871, bsz=16458.9, num_updates=23500, lr=0.000412568, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=31961 epoch 014: 1382 / 1707 loss=3.652, nll_loss=2.11, ppl=4.32, wps=362164, ups=0.74, wpb=489871, bsz=16458.9, num_updates=23500, lr=0.000412568, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=31961 epoch 014: 1382 / 1707 loss=3.652, nll_loss=2.11, ppl=4.32, wps=362164, ups=0.74, wpb=489871, bsz=16458.9, num_updates=23500, lr=0.000412568, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=31961 epoch 014: 1382 / 1707 loss=3.652, nll_loss=2.11, ppl=4.32, wps=362164, ups=0.74, wpb=489871, bsz=16458.9, num_updates=23500, lr=0.000412568, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=31961 epoch 014: 1382 / 1707 loss=3.652, nll_loss=2.11, ppl=4.32, wps=362164, ups=0.74, wpb=489871, bsz=16458.9, num_updates=23500, lr=0.000412568, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=31961 epoch 014: 1382 / 1707 loss=3.652, nll_loss=2.11, ppl=4.32, wps=362164, ups=0.74, wpb=489871, bsz=16458.9, num_updates=23500, lr=0.000412568, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=31961 epoch 014: 1382 / 1707 loss=3.652, nll_loss=2.11, ppl=4.32, wps=362164, ups=0.74, wpb=489871, bsz=16458.9, num_updates=23500, lr=0.000412568, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=31961 epoch 014: 1382 / 1707 loss=3.652, nll_loss=2.11, ppl=4.32, wps=362164, ups=0.74, wpb=489871, bsz=16458.9, num_updates=23500, lr=0.000412568, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=31961 epoch 014: 1382 / 1707 loss=3.652, nll_loss=2.11, ppl=4.32, wps=362164, ups=0.74, wpb=489871, bsz=16458.9, num_updates=23500, lr=0.000412568, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=31961 epoch 014: 1382 / 1707 loss=3.652, nll_loss=2.11, ppl=4.32, wps=362164, ups=0.74, wpb=489871, bsz=16458.9, num_updates=23500, lr=0.000412568, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=31961 epoch 014: 1382 / 1707 loss=3.652, nll_loss=2.11, ppl=4.32, wps=362164, ups=0.74, wpb=489871, bsz=16458.9, num_updates=23500, lr=0.000412568, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=31961 epoch 014: 1382 / 1707 loss=3.652, nll_loss=2.11, ppl=4.32, wps=362164, ups=0.74, wpb=489871, bsz=16458.9, num_updates=23500, lr=0.000412568, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=31961 epoch 014: 1382 / 1707 loss=3.652, nll_loss=2.11, ppl=4.32, wps=362164, ups=0.74, wpb=489871, bsz=16458.9, num_updates=23500, lr=0.000412568, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=31961 epoch 014: 1382 / 1707 loss=3.652, nll_loss=2.11, ppl=4.32, wps=362164, ups=0.74, wpb=489871, bsz=16458.9, num_updates=23500, lr=0.000412568, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=31961 epoch 014: 1482 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=365628, ups=0.75, wpb=489705, bsz=16368.1, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.2, wall=32095 epoch 014: 1482 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=365628, ups=0.75, wpb=489705, bsz=16368.1, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.2, wall=32095 epoch 014: 1482 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=365628, ups=0.75, wpb=489705, bsz=16368.1, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.2, wall=32095 epoch 014: 1482 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=365628, ups=0.75, wpb=489705, bsz=16368.1, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.2, wall=32095 epoch 014: 1482 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=365628, ups=0.75, wpb=489705, bsz=16368.1, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.2, wall=32095 epoch 014: 1482 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=365628, ups=0.75, wpb=489705, bsz=16368.1, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.2, wall=32095 epoch 014: 1482 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=365628, ups=0.75, wpb=489705, bsz=16368.1, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.2, wall=32095 epoch 014: 1482 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=365628, ups=0.75, wpb=489705, bsz=16368.1, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.2, wall=32095 epoch 014: 1482 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=365628, ups=0.75, wpb=489705, bsz=16368.1, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.2, wall=32095 epoch 014: 1482 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=365628, ups=0.75, wpb=489705, bsz=16368.1, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.2, wall=32095 epoch 014: 1482 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=365628, ups=0.75, wpb=489705, bsz=16368.1, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.2, wall=32095 epoch 014: 1482 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=365628, ups=0.75, wpb=489705, bsz=16368.1, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.2, wall=32095 epoch 014: 1482 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=365628, ups=0.75, wpb=489705, bsz=16368.1, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.2, wall=32095 epoch 014: 1482 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=365628, ups=0.75, wpb=489705, bsz=16368.1, num_updates=23600, lr=0.000411693, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.2, wall=32095 epoch 014: 1582 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=366395, ups=0.75, wpb=490530, bsz=16177.4, num_updates=23700, lr=0.000410824, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32229 epoch 014: 1582 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=366395, ups=0.75, wpb=490530, bsz=16177.4, num_updates=23700, lr=0.000410824, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32229 epoch 014: 1582 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=366395, ups=0.75, wpb=490530, bsz=16177.4, num_updates=23700, lr=0.000410824, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32229 epoch 014: 1582 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=366395, ups=0.75, wpb=490530, bsz=16177.4, num_updates=23700, lr=0.000410824, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32229 epoch 014: 1582 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=366395, ups=0.75, wpb=490530, bsz=16177.4, num_updates=23700, lr=0.000410824, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32229 epoch 014: 1582 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=366395, ups=0.75, wpb=490530, bsz=16177.4, num_updates=23700, lr=0.000410824, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32229 epoch 014: 1582 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=366395, ups=0.75, wpb=490530, bsz=16177.4, num_updates=23700, lr=0.000410824, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32229 epoch 014: 1582 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=366395, ups=0.75, wpb=490530, bsz=16177.4, num_updates=23700, lr=0.000410824, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32229 epoch 014: 1582 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=366395, ups=0.75, wpb=490530, bsz=16177.4, num_updates=23700, lr=0.000410824, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32229 epoch 014: 1582 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=366395, ups=0.75, wpb=490530, bsz=16177.4, num_updates=23700, lr=0.000410824, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32229 epoch 014: 1582 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=366395, ups=0.75, wpb=490530, bsz=16177.4, num_updates=23700, lr=0.000410824, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32229 epoch 014: 1582 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=366395, ups=0.75, wpb=490530, bsz=16177.4, num_updates=23700, lr=0.000410824, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32229 epoch 014: 1582 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=366395, ups=0.75, wpb=490530, bsz=16177.4, num_updates=23700, lr=0.000410824, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32229 epoch 014: 1582 / 1707 loss=3.653, nll_loss=2.111, ppl=4.32, wps=366395, ups=0.75, wpb=490530, bsz=16177.4, num_updates=23700, lr=0.000410824, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32229 epoch 014: 1682 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=365547, ups=0.75, wpb=490155, bsz=16382.7, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=32363 epoch 014: 1682 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=365547, ups=0.75, wpb=490155, bsz=16382.7, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=32363 epoch 014: 1682 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=365547, ups=0.75, wpb=490155, bsz=16382.7, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=32363 epoch 014: 1682 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=365547, ups=0.75, wpb=490155, bsz=16382.7, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=32363 epoch 014: 1682 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=365547, ups=0.75, wpb=490155, bsz=16382.7, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=32363 epoch 014: 1682 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=365547, ups=0.75, wpb=490155, bsz=16382.7, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=32363 epoch 014: 1682 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=365547, ups=0.75, wpb=490155, bsz=16382.7, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=32363 epoch 014: 1682 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=365547, ups=0.75, wpb=490155, bsz=16382.7, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=32363 epoch 014: 1682 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=365547, ups=0.75, wpb=490155, bsz=16382.7, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=32363 epoch 014: 1682 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=365547, ups=0.75, wpb=490155, bsz=16382.7, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=32363 epoch 014: 1682 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=365547, ups=0.75, wpb=490155, bsz=16382.7, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=32363 epoch 014: 1682 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=365547, ups=0.75, wpb=490155, bsz=16382.7, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=32363 epoch 014: 1682 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=365547, ups=0.75, wpb=490155, bsz=16382.7, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=32363 epoch 014: 1682 / 1707 loss=3.65, nll_loss=2.107, ppl=4.31, wps=365547, ups=0.75, wpb=490155, bsz=16382.7, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=32363 end of epoch 14 (average epoch stats below) epoch 014 | loss 3.646 | nll_loss 2.103 | ppl 4.3 | wps 363288 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 23824 | lr 0.000409753 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.3 | wall 32395 epoch 014 | loss 3.646 | nll_loss 2.103 | ppl 4.3 | wps 363288 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 23824 | lr 0.000409753 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.3 | wall 32395 epoch 014 | loss 3.646 | nll_loss 2.103 | ppl 4.3 | wps 363288 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 23824 | lr 0.000409753 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.3 | wall 32395 epoch 014 | loss 3.646 | nll_loss 2.103 | ppl 4.3 | wps 363288 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 23824 | lr 0.000409753 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.3 | wall 32395 epoch 014 | loss 3.646 | nll_loss 2.103 | ppl 4.3 | wps 363288 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 23824 | lr 0.000409753 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.3 | wall 32395 epoch 014 | loss 3.646 | nll_loss 2.103 | ppl 4.3 | wps 363288 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 23824 | lr 0.000409753 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.3 | wall 32395 epoch 014 | loss 3.646 | nll_loss 2.103 | ppl 4.3 | wps 363288 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 23824 | lr 0.000409753 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.3 | wall 32395 epoch 014 | loss 3.646 | nll_loss 2.103 | ppl 4.3 | wps 363288 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 23824 | lr 0.000409753 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.3 | wall 32395 epoch 014 | loss 3.646 | nll_loss 2.103 | ppl 4.3 | wps 363288 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 23824 | lr 0.000409753 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.3 | wall 32395 epoch 014 | loss 3.646 | nll_loss 2.103 | ppl 4.3 | wps 363288 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 23824 | lr 0.000409753 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.3 | wall 32395 epoch 014 | loss 3.646 | nll_loss 2.103 | ppl 4.3 | wps 363288 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 23824 | lr 0.000409753 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.3 | wall 32395 epoch 014 | loss 3.646 | nll_loss 2.103 | ppl 4.3 | wps 363288 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 23824 | lr 0.000409753 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.3 | wall 32395 epoch 014 | loss 3.646 | nll_loss 2.103 | ppl 4.3 | wps 363288 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 23824 | lr 0.000409753 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.3 | wall 32395 epoch 014 | loss 3.646 | nll_loss 2.103 | ppl 4.3 | wps 363288 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 23824 | lr 0.000409753 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.3 | wall 32395 Start iterating over samples epoch 015: 76 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=362141, ups=0.74, wpb=486753, bsz=16316.3, num_updates=23900, lr=0.000409101, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=32497 epoch 015: 76 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=362141, ups=0.74, wpb=486753, bsz=16316.3, num_updates=23900, lr=0.000409101, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=32497 epoch 015: 76 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=362141, ups=0.74, wpb=486753, bsz=16316.3, num_updates=23900, lr=0.000409101, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=32497 epoch 015: 76 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=362141, ups=0.74, wpb=486753, bsz=16316.3, num_updates=23900, lr=0.000409101, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=32497 epoch 015: 76 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=362141, ups=0.74, wpb=486753, bsz=16316.3, num_updates=23900, lr=0.000409101, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=32497 epoch 015: 76 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=362141, ups=0.74, wpb=486753, bsz=16316.3, num_updates=23900, lr=0.000409101, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=32497 epoch 015: 76 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=362141, ups=0.74, wpb=486753, bsz=16316.3, num_updates=23900, lr=0.000409101, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=32497 epoch 015: 76 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=362141, ups=0.74, wpb=486753, bsz=16316.3, num_updates=23900, lr=0.000409101, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=32497 epoch 015: 76 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=362141, ups=0.74, wpb=486753, bsz=16316.3, num_updates=23900, lr=0.000409101, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=32497 epoch 015: 76 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=362141, ups=0.74, wpb=486753, bsz=16316.3, num_updates=23900, lr=0.000409101, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=32497 epoch 015: 76 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=362141, ups=0.74, wpb=486753, bsz=16316.3, num_updates=23900, lr=0.000409101, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=32497 epoch 015: 76 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=362141, ups=0.74, wpb=486753, bsz=16316.3, num_updates=23900, lr=0.000409101, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=32497 epoch 015: 76 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=362141, ups=0.74, wpb=486753, bsz=16316.3, num_updates=23900, lr=0.000409101, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=32497 epoch 015: 76 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=362141, ups=0.74, wpb=486753, bsz=16316.3, num_updates=23900, lr=0.000409101, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=32497 epoch 015: 76 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=362141, ups=0.74, wpb=486753, bsz=16316.3, num_updates=23900, lr=0.000409101, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=32497 epoch 015: 176 / 1707 loss=3.617, nll_loss=2.07, ppl=4.2, wps=364848, ups=0.75, wpb=489411, bsz=16218.6, num_updates=24000, lr=0.000408248, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=32631 epoch 015: 176 / 1707 loss=3.617, nll_loss=2.07, ppl=4.2, wps=364848, ups=0.75, wpb=489411, bsz=16218.6, num_updates=24000, lr=0.000408248, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=32631 epoch 015: 176 / 1707 loss=3.617, nll_loss=2.07, ppl=4.2, wps=364848, ups=0.75, wpb=489411, bsz=16218.6, num_updates=24000, lr=0.000408248, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=32631 epoch 015: 176 / 1707 loss=3.617, nll_loss=2.07, ppl=4.2, wps=364848, ups=0.75, wpb=489411, bsz=16218.6, num_updates=24000, lr=0.000408248, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=32631 epoch 015: 176 / 1707 loss=3.617, nll_loss=2.07, ppl=4.2, wps=364848, ups=0.75, wpb=489411, bsz=16218.6, num_updates=24000, lr=0.000408248, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=32631 epoch 015: 176 / 1707 loss=3.617, nll_loss=2.07, ppl=4.2, wps=364848, ups=0.75, wpb=489411, bsz=16218.6, num_updates=24000, lr=0.000408248, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=32631 epoch 015: 176 / 1707 loss=3.617, nll_loss=2.07, ppl=4.2, wps=364848, ups=0.75, wpb=489411, bsz=16218.6, num_updates=24000, lr=0.000408248, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=32631 epoch 015: 176 / 1707 loss=3.617, nll_loss=2.07, ppl=4.2, wps=364848, ups=0.75, wpb=489411, bsz=16218.6, num_updates=24000, lr=0.000408248, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=32631 epoch 015: 176 / 1707 loss=3.617, nll_loss=2.07, ppl=4.2, wps=364848, ups=0.75, wpb=489411, bsz=16218.6, num_updates=24000, lr=0.000408248, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=32631 epoch 015: 176 / 1707 loss=3.617, nll_loss=2.07, ppl=4.2, wps=364848, ups=0.75, wpb=489411, bsz=16218.6, num_updates=24000, lr=0.000408248, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=32631 epoch 015: 176 / 1707 loss=3.617, nll_loss=2.07, ppl=4.2, wps=364848, ups=0.75, wpb=489411, bsz=16218.6, num_updates=24000, lr=0.000408248, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=32631 epoch 015: 176 / 1707 loss=3.617, nll_loss=2.07, ppl=4.2, wps=364848, ups=0.75, wpb=489411, bsz=16218.6, num_updates=24000, lr=0.000408248, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=32631 epoch 015: 176 / 1707 loss=3.617, nll_loss=2.07, ppl=4.2, wps=364848, ups=0.75, wpb=489411, bsz=16218.6, num_updates=24000, lr=0.000408248, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=32631 epoch 015: 176 / 1707 loss=3.617, nll_loss=2.07, ppl=4.2, wps=364848, ups=0.75, wpb=489411, bsz=16218.6, num_updates=24000, lr=0.000408248, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=32631 epoch 015: 176 / 1707 loss=3.617, nll_loss=2.07, ppl=4.2, wps=364848, ups=0.75, wpb=489411, bsz=16218.6, num_updates=24000, lr=0.000408248, gnorm=0.252, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=32631 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 3.8 | nll_loss 2.253 | ppl 4.77 | wps 218379 | wpb 22263 | bsz 1004 | num_updates 24000 | best_loss 3.8 epoch 015 | valid on 'valid' subset | loss 3.8 | nll_loss 2.253 | ppl 4.77 | wps 218379 | wpb 22263 | bsz 1004 | num_updates 24000 | best_loss 3.8 epoch 015 | valid on 'valid' subset | loss 3.8 | nll_loss 2.253 | ppl 4.77 | wps 218379 | wpb 22263 | bsz 1004 | num_updates 24000 | best_loss 3.8 epoch 015 | valid on 'valid' subset | loss 3.8 | nll_loss 2.253 | ppl 4.77 | wps 218379 | wpb 22263 | bsz 1004 | num_updates 24000 | best_loss 3.8 epoch 015 | valid on 'valid' subset | loss 3.8 | nll_loss 2.253 | ppl 4.77 | wps 218379 | wpb 22263 | bsz 1004 | num_updates 24000 | best_loss 3.8 epoch 015 | valid on 'valid' subset | loss 3.8 | nll_loss 2.253 | ppl 4.77 | wps 218379 | wpb 22263 | bsz 1004 | num_updates 24000 | best_loss 3.8 epoch 015 | valid on 'valid' subset | loss 3.8 | nll_loss 2.253 | ppl 4.77 | wps 218379 | wpb 22263 | bsz 1004 | num_updates 24000 | best_loss 3.8 epoch 015 | valid on 'valid' subset | loss 3.8 | nll_loss 2.253 | ppl 4.77 | wps 218379 | wpb 22263 | bsz 1004 | num_updates 24000 | best_loss 3.8 epoch 015 | valid on 'valid' subset | loss 3.8 | nll_loss 2.253 | ppl 4.77 | wps 218379 | wpb 22263 | bsz 1004 | num_updates 24000 | best_loss 3.8 epoch 015 | valid on 'valid' subset | loss 3.8 | nll_loss 2.253 | ppl 4.77 | wps 218379 | wpb 22263 | bsz 1004 | num_updates 24000 | best_loss 3.8 epoch 015 | valid on 'valid' subset | loss 3.8 | nll_loss 2.253 | ppl 4.77 | wps 218379 | wpb 22263 | bsz 1004 | num_updates 24000 | best_loss 3.8 epoch 015 | valid on 'valid' subset | loss 3.8 | nll_loss 2.253 | ppl 4.77 | wps 218379 | wpb 22263 | bsz 1004 | num_updates 24000 | best_loss 3.8 epoch 015 | valid on 'valid' subset | loss 3.8 | nll_loss 2.253 | ppl 4.77 | wps 218379 | wpb 22263 | bsz 1004 | num_updates 24000 | best_loss 3.8 epoch 015 | valid on 'valid' subset | loss 3.8 | nll_loss 2.253 | ppl 4.77 | wps 218379 | wpb 22263 | bsz 1004 | num_updates 24000 | best_loss 3.8 epoch 015 | valid on 'valid' subset | loss 3.8 | nll_loss 2.253 | ppl 4.77 | wps 218379 | wpb 22263 | bsz 1004 | num_updates 24000 | best_loss 3.8 epoch 015: 276 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=324992, ups=0.66, wpb=489782, bsz=16238.3, num_updates=24100, lr=0.0004074, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32782 epoch 015: 276 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=324992, ups=0.66, wpb=489782, bsz=16238.3, num_updates=24100, lr=0.0004074, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32782 epoch 015: 276 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=324992, ups=0.66, wpb=489782, bsz=16238.3, num_updates=24100, lr=0.0004074, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32782 epoch 015: 276 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=324992, ups=0.66, wpb=489782, bsz=16238.3, num_updates=24100, lr=0.0004074, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32782 epoch 015: 276 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=324992, ups=0.66, wpb=489782, bsz=16238.3, num_updates=24100, lr=0.0004074, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32782 epoch 015: 276 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=324992, ups=0.66, wpb=489782, bsz=16238.3, num_updates=24100, lr=0.0004074, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32782 epoch 015: 276 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=324992, ups=0.66, wpb=489782, bsz=16238.3, num_updates=24100, lr=0.0004074, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32782 epoch 015: 276 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=324992, ups=0.66, wpb=489782, bsz=16238.3, num_updates=24100, lr=0.0004074, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32782 epoch 015: 276 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=324992, ups=0.66, wpb=489782, bsz=16238.3, num_updates=24100, lr=0.0004074, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32782 epoch 015: 276 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=324992, ups=0.66, wpb=489782, bsz=16238.3, num_updates=24100, lr=0.0004074, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32782 epoch 015: 276 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=324992, ups=0.66, wpb=489782, bsz=16238.3, num_updates=24100, lr=0.0004074, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32782 epoch 015: 276 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=324992, ups=0.66, wpb=489782, bsz=16238.3, num_updates=24100, lr=0.0004074, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32782 epoch 015: 276 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=324992, ups=0.66, wpb=489782, bsz=16238.3, num_updates=24100, lr=0.0004074, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32782 epoch 015: 276 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=324992, ups=0.66, wpb=489782, bsz=16238.3, num_updates=24100, lr=0.0004074, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32782 epoch 015: 276 / 1707 loss=3.625, nll_loss=2.079, ppl=4.22, wps=324992, ups=0.66, wpb=489782, bsz=16238.3, num_updates=24100, lr=0.0004074, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=32782 epoch 015: 376 / 1707 loss=3.628, nll_loss=2.082, ppl=4.23, wps=367587, ups=0.75, wpb=491117, bsz=16463.1, num_updates=24200, lr=0.000406558, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=32916 epoch 015: 376 / 1707 loss=3.628, nll_loss=2.082, ppl=4.23, wps=367587, ups=0.75, wpb=491117, bsz=16463.1, num_updates=24200, lr=0.000406558, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=32916 epoch 015: 376 / 1707 loss=3.628, nll_loss=2.082, ppl=4.23, wps=367587, ups=0.75, wpb=491117, bsz=16463.1, num_updates=24200, lr=0.000406558, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=32916 epoch 015: 376 / 1707 loss=3.628, nll_loss=2.082, ppl=4.23, wps=367587, ups=0.75, wpb=491117, bsz=16463.1, num_updates=24200, lr=0.000406558, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=32916 epoch 015: 376 / 1707 loss=3.628, nll_loss=2.082, ppl=4.23, wps=367587, ups=0.75, wpb=491117, bsz=16463.1, num_updates=24200, lr=0.000406558, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=32916 epoch 015: 376 / 1707 loss=3.628, nll_loss=2.082, ppl=4.23, wps=367587, ups=0.75, wpb=491117, bsz=16463.1, num_updates=24200, lr=0.000406558, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=32916 epoch 015: 376 / 1707 loss=3.628, nll_loss=2.082, ppl=4.23, wps=367587, ups=0.75, wpb=491117, bsz=16463.1, num_updates=24200, lr=0.000406558, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=32916 epoch 015: 376 / 1707 loss=3.628, nll_loss=2.082, ppl=4.23, wps=367587, ups=0.75, wpb=491117, bsz=16463.1, num_updates=24200, lr=0.000406558, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=32916 epoch 015: 376 / 1707 loss=3.628, nll_loss=2.082, ppl=4.23, wps=367587, ups=0.75, wpb=491117, bsz=16463.1, num_updates=24200, lr=0.000406558, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=32916 epoch 015: 376 / 1707 loss=3.628, nll_loss=2.082, ppl=4.23, wps=367587, ups=0.75, wpb=491117, bsz=16463.1, num_updates=24200, lr=0.000406558, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=32916 epoch 015: 376 / 1707 loss=3.628, nll_loss=2.082, ppl=4.23, wps=367587, ups=0.75, wpb=491117, bsz=16463.1, num_updates=24200, lr=0.000406558, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=32916 epoch 015: 376 / 1707 loss=3.628, nll_loss=2.082, ppl=4.23, wps=367587, ups=0.75, wpb=491117, bsz=16463.1, num_updates=24200, lr=0.000406558, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=32916 epoch 015: 376 / 1707 loss=3.628, nll_loss=2.082, ppl=4.23, wps=367587, ups=0.75, wpb=491117, bsz=16463.1, num_updates=24200, lr=0.000406558, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=32916 epoch 015: 376 / 1707 loss=3.628, nll_loss=2.082, ppl=4.23, wps=367587, ups=0.75, wpb=491117, bsz=16463.1, num_updates=24200, lr=0.000406558, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=32916 epoch 015: 376 / 1707 loss=3.628, nll_loss=2.082, ppl=4.23, wps=367587, ups=0.75, wpb=491117, bsz=16463.1, num_updates=24200, lr=0.000406558, gnorm=0.257, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=32916 epoch 015: 476 / 1707 loss=3.629, nll_loss=2.084, ppl=4.24, wps=364606, ups=0.74, wpb=489708, bsz=16351.8, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=33050 epoch 015: 476 / 1707 loss=3.629, nll_loss=2.084, ppl=4.24, wps=364606, ups=0.74, wpb=489708, bsz=16351.8, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=33050 epoch 015: 476 / 1707 loss=3.629, nll_loss=2.084, ppl=4.24, wps=364606, ups=0.74, wpb=489708, bsz=16351.8, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=33050 epoch 015: 476 / 1707 loss=3.629, nll_loss=2.084, ppl=4.24, wps=364606, ups=0.74, wpb=489708, bsz=16351.8, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=33050 epoch 015: 476 / 1707 loss=3.629, nll_loss=2.084, ppl=4.24, wps=364606, ups=0.74, wpb=489708, bsz=16351.8, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=33050 epoch 015: 476 / 1707 loss=3.629, nll_loss=2.084, ppl=4.24, wps=364606, ups=0.74, wpb=489708, bsz=16351.8, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=33050 epoch 015: 476 / 1707 loss=3.629, nll_loss=2.084, ppl=4.24, wps=364606, ups=0.74, wpb=489708, bsz=16351.8, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=33050 epoch 015: 476 / 1707 loss=3.629, nll_loss=2.084, ppl=4.24, wps=364606, ups=0.74, wpb=489708, bsz=16351.8, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=33050 epoch 015: 476 / 1707 loss=3.629, nll_loss=2.084, ppl=4.24, wps=364606, ups=0.74, wpb=489708, bsz=16351.8, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=33050 epoch 015: 476 / 1707 loss=3.629, nll_loss=2.084, ppl=4.24, wps=364606, ups=0.74, wpb=489708, bsz=16351.8, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=33050 epoch 015: 476 / 1707 loss=3.629, nll_loss=2.084, ppl=4.24, wps=364606, ups=0.74, wpb=489708, bsz=16351.8, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=33050 epoch 015: 476 / 1707 loss=3.629, nll_loss=2.084, ppl=4.24, wps=364606, ups=0.74, wpb=489708, bsz=16351.8, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=33050 epoch 015: 476 / 1707 loss=3.629, nll_loss=2.084, ppl=4.24, wps=364606, ups=0.74, wpb=489708, bsz=16351.8, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=33050 epoch 015: 476 / 1707 loss=3.629, nll_loss=2.084, ppl=4.24, wps=364606, ups=0.74, wpb=489708, bsz=16351.8, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=33050 epoch 015: 476 / 1707 loss=3.629, nll_loss=2.084, ppl=4.24, wps=364606, ups=0.74, wpb=489708, bsz=16351.8, num_updates=24300, lr=0.00040572, gnorm=0.263, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=33050 epoch 015: 578 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=360880, ups=0.74, wpb=490156, bsz=16035.1, num_updates=24400, lr=0.000404888, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=33186 epoch 015: 578 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=360880, ups=0.74, wpb=490156, bsz=16035.1, num_updates=24400, lr=0.000404888, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=33186 epoch 015: 578 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=360880, ups=0.74, wpb=490156, bsz=16035.1, num_updates=24400, lr=0.000404888, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=33186 epoch 015: 578 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=360880, ups=0.74, wpb=490156, bsz=16035.1, num_updates=24400, lr=0.000404888, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=33186 epoch 015: 578 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=360880, ups=0.74, wpb=490156, bsz=16035.1, num_updates=24400, lr=0.000404888, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=33186 epoch 015: 578 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=360880, ups=0.74, wpb=490156, bsz=16035.1, num_updates=24400, lr=0.000404888, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=33186 epoch 015: 578 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=360880, ups=0.74, wpb=490156, bsz=16035.1, num_updates=24400, lr=0.000404888, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=33186 epoch 015: 578 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=360880, ups=0.74, wpb=490156, bsz=16035.1, num_updates=24400, lr=0.000404888, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=33186 epoch 015: 578 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=360880, ups=0.74, wpb=490156, bsz=16035.1, num_updates=24400, lr=0.000404888, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=33186 epoch 015: 578 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=360880, ups=0.74, wpb=490156, bsz=16035.1, num_updates=24400, lr=0.000404888, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=33186 epoch 015: 578 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=360880, ups=0.74, wpb=490156, bsz=16035.1, num_updates=24400, lr=0.000404888, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=33186 epoch 015: 578 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=360880, ups=0.74, wpb=490156, bsz=16035.1, num_updates=24400, lr=0.000404888, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=33186 epoch 015: 578 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=360880, ups=0.74, wpb=490156, bsz=16035.1, num_updates=24400, lr=0.000404888, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=33186 epoch 015: 578 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=360880, ups=0.74, wpb=490156, bsz=16035.1, num_updates=24400, lr=0.000404888, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=33186 epoch 015: 578 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=360880, ups=0.74, wpb=490156, bsz=16035.1, num_updates=24400, lr=0.000404888, gnorm=0.262, clip=0, loss_scale=2, train_wall=135, gb_free=60.3, wall=33186 epoch 015: 678 / 1707 loss=3.628, nll_loss=2.083, ppl=4.24, wps=365458, ups=0.75, wpb=489355, bsz=16375.8, num_updates=24500, lr=0.000404061, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33320 epoch 015: 678 / 1707 loss=3.628, nll_loss=2.083, ppl=4.24, wps=365458, ups=0.75, wpb=489355, bsz=16375.8, num_updates=24500, lr=0.000404061, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33320 epoch 015: 678 / 1707 loss=3.628, nll_loss=2.083, ppl=4.24, wps=365458, ups=0.75, wpb=489355, bsz=16375.8, num_updates=24500, lr=0.000404061, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33320 epoch 015: 678 / 1707 loss=3.628, nll_loss=2.083, ppl=4.24, wps=365458, ups=0.75, wpb=489355, bsz=16375.8, num_updates=24500, lr=0.000404061, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33320 epoch 015: 678 / 1707 loss=3.628, nll_loss=2.083, ppl=4.24, wps=365458, ups=0.75, wpb=489355, bsz=16375.8, num_updates=24500, lr=0.000404061, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33320 epoch 015: 678 / 1707 loss=3.628, nll_loss=2.083, ppl=4.24, wps=365458, ups=0.75, wpb=489355, bsz=16375.8, num_updates=24500, lr=0.000404061, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33320 epoch 015: 678 / 1707 loss=3.628, nll_loss=2.083, ppl=4.24, wps=365458, ups=0.75, wpb=489355, bsz=16375.8, num_updates=24500, lr=0.000404061, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33320 epoch 015: 678 / 1707 loss=3.628, nll_loss=2.083, ppl=4.24, wps=365458, ups=0.75, wpb=489355, bsz=16375.8, num_updates=24500, lr=0.000404061, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33320 epoch 015: 678 / 1707 loss=3.628, nll_loss=2.083, ppl=4.24, wps=365458, ups=0.75, wpb=489355, bsz=16375.8, num_updates=24500, lr=0.000404061, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33320 epoch 015: 678 / 1707 loss=3.628, nll_loss=2.083, ppl=4.24, wps=365458, ups=0.75, wpb=489355, bsz=16375.8, num_updates=24500, lr=0.000404061, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33320 epoch 015: 678 / 1707 loss=3.628, nll_loss=2.083, ppl=4.24, wps=365458, ups=0.75, wpb=489355, bsz=16375.8, num_updates=24500, lr=0.000404061, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33320 epoch 015: 678 / 1707 loss=3.628, nll_loss=2.083, ppl=4.24, wps=365458, ups=0.75, wpb=489355, bsz=16375.8, num_updates=24500, lr=0.000404061, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33320 epoch 015: 678 / 1707 loss=3.628, nll_loss=2.083, ppl=4.24, wps=365458, ups=0.75, wpb=489355, bsz=16375.8, num_updates=24500, lr=0.000404061, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33320 epoch 015: 678 / 1707 loss=3.628, nll_loss=2.083, ppl=4.24, wps=365458, ups=0.75, wpb=489355, bsz=16375.8, num_updates=24500, lr=0.000404061, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33320 epoch 015: 678 / 1707 loss=3.628, nll_loss=2.083, ppl=4.24, wps=365458, ups=0.75, wpb=489355, bsz=16375.8, num_updates=24500, lr=0.000404061, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33320 epoch 015: 778 / 1707 loss=3.634, nll_loss=2.089, ppl=4.26, wps=368636, ups=0.75, wpb=491496, bsz=16540.8, num_updates=24600, lr=0.000403239, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33453 epoch 015: 778 / 1707 loss=3.634, nll_loss=2.089, ppl=4.26, wps=368636, ups=0.75, wpb=491496, bsz=16540.8, num_updates=24600, lr=0.000403239, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33453 epoch 015: 778 / 1707 loss=3.634, nll_loss=2.089, ppl=4.26, wps=368636, ups=0.75, wpb=491496, bsz=16540.8, num_updates=24600, lr=0.000403239, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33453 epoch 015: 778 / 1707 loss=3.634, nll_loss=2.089, ppl=4.26, wps=368636, ups=0.75, wpb=491496, bsz=16540.8, num_updates=24600, lr=0.000403239, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33453 epoch 015: 778 / 1707 loss=3.634, nll_loss=2.089, ppl=4.26, wps=368636, ups=0.75, wpb=491496, bsz=16540.8, num_updates=24600, lr=0.000403239, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33453 epoch 015: 778 / 1707 loss=3.634, nll_loss=2.089, ppl=4.26, wps=368636, ups=0.75, wpb=491496, bsz=16540.8, num_updates=24600, lr=0.000403239, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33453 epoch 015: 778 / 1707 loss=3.634, nll_loss=2.089, ppl=4.26, wps=368636, ups=0.75, wpb=491496, bsz=16540.8, num_updates=24600, lr=0.000403239, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33453 epoch 015: 778 / 1707 loss=3.634, nll_loss=2.089, ppl=4.26, wps=368636, ups=0.75, wpb=491496, bsz=16540.8, num_updates=24600, lr=0.000403239, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33453 epoch 015: 778 / 1707 loss=3.634, nll_loss=2.089, ppl=4.26, wps=368636, ups=0.75, wpb=491496, bsz=16540.8, num_updates=24600, lr=0.000403239, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33453 epoch 015: 778 / 1707 loss=3.634, nll_loss=2.089, ppl=4.26, wps=368636, ups=0.75, wpb=491496, bsz=16540.8, num_updates=24600, lr=0.000403239, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33453 epoch 015: 778 / 1707 loss=3.634, nll_loss=2.089, ppl=4.26, wps=368636, ups=0.75, wpb=491496, bsz=16540.8, num_updates=24600, lr=0.000403239, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33453 epoch 015: 778 / 1707 loss=3.634, nll_loss=2.089, ppl=4.26, wps=368636, ups=0.75, wpb=491496, bsz=16540.8, num_updates=24600, lr=0.000403239, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33453 epoch 015: 778 / 1707 loss=3.634, nll_loss=2.089, ppl=4.26, wps=368636, ups=0.75, wpb=491496, bsz=16540.8, num_updates=24600, lr=0.000403239, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33453 epoch 015: 778 / 1707 loss=3.634, nll_loss=2.089, ppl=4.26, wps=368636, ups=0.75, wpb=491496, bsz=16540.8, num_updates=24600, lr=0.000403239, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33453 epoch 015: 778 / 1707 loss=3.634, nll_loss=2.089, ppl=4.26, wps=368636, ups=0.75, wpb=491496, bsz=16540.8, num_updates=24600, lr=0.000403239, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=33453 epoch 015: 879 / 1707 loss=3.639, nll_loss=2.096, ppl=4.27, wps=363263, ups=0.74, wpb=489998, bsz=16290.4, num_updates=24700, lr=0.000402422, gnorm=0.253, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=33588 epoch 015: 879 / 1707 loss=3.639, nll_loss=2.096, ppl=4.27, wps=363263, ups=0.74, wpb=489998, bsz=16290.4, num_updates=24700, lr=0.000402422, gnorm=0.253, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=33588 epoch 015: 879 / 1707 loss=3.639, nll_loss=2.096, ppl=4.27, wps=363263, ups=0.74, wpb=489998, bsz=16290.4, num_updates=24700, lr=0.000402422, gnorm=0.253, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=33588 epoch 015: 879 / 1707 loss=3.639, nll_loss=2.096, ppl=4.27, wps=363263, ups=0.74, wpb=489998, bsz=16290.4, num_updates=24700, lr=0.000402422, gnorm=0.253, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=33588 epoch 015: 879 / 1707 loss=3.639, nll_loss=2.096, ppl=4.27, wps=363263, ups=0.74, wpb=489998, bsz=16290.4, num_updates=24700, lr=0.000402422, gnorm=0.253, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=33588 epoch 015: 879 / 1707 loss=3.639, nll_loss=2.096, ppl=4.27, wps=363263, ups=0.74, wpb=489998, bsz=16290.4, num_updates=24700, lr=0.000402422, gnorm=0.253, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=33588 epoch 015: 879 / 1707 loss=3.639, nll_loss=2.096, ppl=4.27, wps=363263, ups=0.74, wpb=489998, bsz=16290.4, num_updates=24700, lr=0.000402422, gnorm=0.253, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=33588 epoch 015: 879 / 1707 loss=3.639, nll_loss=2.096, ppl=4.27, wps=363263, ups=0.74, wpb=489998, bsz=16290.4, num_updates=24700, lr=0.000402422, gnorm=0.253, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=33588 epoch 015: 879 / 1707 loss=3.639, nll_loss=2.096, ppl=4.27, wps=363263, ups=0.74, wpb=489998, bsz=16290.4, num_updates=24700, lr=0.000402422, gnorm=0.253, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=33588 epoch 015: 879 / 1707 loss=3.639, nll_loss=2.096, ppl=4.27, wps=363263, ups=0.74, wpb=489998, bsz=16290.4, num_updates=24700, lr=0.000402422, gnorm=0.253, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=33588 epoch 015: 879 / 1707 loss=3.639, nll_loss=2.096, ppl=4.27, wps=363263, ups=0.74, wpb=489998, bsz=16290.4, num_updates=24700, lr=0.000402422, gnorm=0.253, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=33588 epoch 015: 879 / 1707 loss=3.639, nll_loss=2.096, ppl=4.27, wps=363263, ups=0.74, wpb=489998, bsz=16290.4, num_updates=24700, lr=0.000402422, gnorm=0.253, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=33588 epoch 015: 879 / 1707 loss=3.639, nll_loss=2.096, ppl=4.27, wps=363263, ups=0.74, wpb=489998, bsz=16290.4, num_updates=24700, lr=0.000402422, gnorm=0.253, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=33588 epoch 015: 879 / 1707 loss=3.639, nll_loss=2.096, ppl=4.27, wps=363263, ups=0.74, wpb=489998, bsz=16290.4, num_updates=24700, lr=0.000402422, gnorm=0.253, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=33588 epoch 015: 879 / 1707 loss=3.639, nll_loss=2.096, ppl=4.27, wps=363263, ups=0.74, wpb=489998, bsz=16290.4, num_updates=24700, lr=0.000402422, gnorm=0.253, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=33588 epoch 015: 979 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=365437, ups=0.75, wpb=489182, bsz=16436.6, num_updates=24800, lr=0.00040161, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.3, wall=33722 epoch 015: 979 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=365437, ups=0.75, wpb=489182, bsz=16436.6, num_updates=24800, lr=0.00040161, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.3, wall=33722 epoch 015: 979 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=365437, ups=0.75, wpb=489182, bsz=16436.6, num_updates=24800, lr=0.00040161, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.3, wall=33722 epoch 015: 979 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=365437, ups=0.75, wpb=489182, bsz=16436.6, num_updates=24800, lr=0.00040161, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.3, wall=33722 epoch 015: 979 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=365437, ups=0.75, wpb=489182, bsz=16436.6, num_updates=24800, lr=0.00040161, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.3, wall=33722 epoch 015: 979 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=365437, ups=0.75, wpb=489182, bsz=16436.6, num_updates=24800, lr=0.00040161, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.3, wall=33722 epoch 015: 979 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=365437, ups=0.75, wpb=489182, bsz=16436.6, num_updates=24800, lr=0.00040161, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.3, wall=33722 epoch 015: 979 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=365437, ups=0.75, wpb=489182, bsz=16436.6, num_updates=24800, lr=0.00040161, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.3, wall=33722 epoch 015: 979 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=365437, ups=0.75, wpb=489182, bsz=16436.6, num_updates=24800, lr=0.00040161, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.3, wall=33722 epoch 015: 979 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=365437, ups=0.75, wpb=489182, bsz=16436.6, num_updates=24800, lr=0.00040161, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.3, wall=33722 epoch 015: 979 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=365437, ups=0.75, wpb=489182, bsz=16436.6, num_updates=24800, lr=0.00040161, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.3, wall=33722 epoch 015: 979 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=365437, ups=0.75, wpb=489182, bsz=16436.6, num_updates=24800, lr=0.00040161, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.3, wall=33722 epoch 015: 979 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=365437, ups=0.75, wpb=489182, bsz=16436.6, num_updates=24800, lr=0.00040161, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.3, wall=33722 epoch 015: 979 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=365437, ups=0.75, wpb=489182, bsz=16436.6, num_updates=24800, lr=0.00040161, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.3, wall=33722 epoch 015: 979 / 1707 loss=3.634, nll_loss=2.09, ppl=4.26, wps=365437, ups=0.75, wpb=489182, bsz=16436.6, num_updates=24800, lr=0.00040161, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.3, wall=33722 epoch 015: 1079 / 1707 loss=3.638, nll_loss=2.094, ppl=4.27, wps=365602, ups=0.75, wpb=489613, bsz=16235.8, num_updates=24900, lr=0.000400802, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=33856 epoch 015: 1079 / 1707 loss=3.638, nll_loss=2.094, ppl=4.27, wps=365602, ups=0.75, wpb=489613, bsz=16235.8, num_updates=24900, lr=0.000400802, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=33856 epoch 015: 1079 / 1707 loss=3.638, nll_loss=2.094, ppl=4.27, wps=365602, ups=0.75, wpb=489613, bsz=16235.8, num_updates=24900, lr=0.000400802, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=33856 epoch 015: 1079 / 1707 loss=3.638, nll_loss=2.094, ppl=4.27, wps=365602, ups=0.75, wpb=489613, bsz=16235.8, num_updates=24900, lr=0.000400802, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=33856 epoch 015: 1079 / 1707 loss=3.638, nll_loss=2.094, ppl=4.27, wps=365602, ups=0.75, wpb=489613, bsz=16235.8, num_updates=24900, lr=0.000400802, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=33856 epoch 015: 1079 / 1707 loss=3.638, nll_loss=2.094, ppl=4.27, wps=365602, ups=0.75, wpb=489613, bsz=16235.8, num_updates=24900, lr=0.000400802, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=33856 epoch 015: 1079 / 1707 loss=3.638, nll_loss=2.094, ppl=4.27, wps=365602, ups=0.75, wpb=489613, bsz=16235.8, num_updates=24900, lr=0.000400802, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=33856 epoch 015: 1079 / 1707 loss=3.638, nll_loss=2.094, ppl=4.27, wps=365602, ups=0.75, wpb=489613, bsz=16235.8, num_updates=24900, lr=0.000400802, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=33856 epoch 015: 1079 / 1707 loss=3.638, nll_loss=2.094, ppl=4.27, wps=365602, ups=0.75, wpb=489613, bsz=16235.8, num_updates=24900, lr=0.000400802, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=33856 epoch 015: 1079 / 1707 loss=3.638, nll_loss=2.094, ppl=4.27, wps=365602, ups=0.75, wpb=489613, bsz=16235.8, num_updates=24900, lr=0.000400802, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=33856 epoch 015: 1079 / 1707 loss=3.638, nll_loss=2.094, ppl=4.27, wps=365602, ups=0.75, wpb=489613, bsz=16235.8, num_updates=24900, lr=0.000400802, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=33856 epoch 015: 1079 / 1707 loss=3.638, nll_loss=2.094, ppl=4.27, wps=365602, ups=0.75, wpb=489613, bsz=16235.8, num_updates=24900, lr=0.000400802, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=33856 epoch 015: 1079 / 1707 loss=3.638, nll_loss=2.094, ppl=4.27, wps=365602, ups=0.75, wpb=489613, bsz=16235.8, num_updates=24900, lr=0.000400802, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=33856 epoch 015: 1079 / 1707 loss=3.638, nll_loss=2.094, ppl=4.27, wps=365602, ups=0.75, wpb=489613, bsz=16235.8, num_updates=24900, lr=0.000400802, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=33856 epoch 015: 1079 / 1707 loss=3.638, nll_loss=2.094, ppl=4.27, wps=365602, ups=0.75, wpb=489613, bsz=16235.8, num_updates=24900, lr=0.000400802, gnorm=0.263, clip=0, loss_scale=2, train_wall=133, gb_free=61.2, wall=33856 epoch 015: 1179 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=366800, ups=0.75, wpb=489835, bsz=16253.8, num_updates=25000, lr=0.0004, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=33989 epoch 015: 1179 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=366800, ups=0.75, wpb=489835, bsz=16253.8, num_updates=25000, lr=0.0004, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=33989 epoch 015: 1179 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=366800, ups=0.75, wpb=489835, bsz=16253.8, num_updates=25000, lr=0.0004, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=33989 epoch 015: 1179 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=366800, ups=0.75, wpb=489835, bsz=16253.8, num_updates=25000, lr=0.0004, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=33989 epoch 015: 1179 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=366800, ups=0.75, wpb=489835, bsz=16253.8, num_updates=25000, lr=0.0004, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=33989 epoch 015: 1179 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=366800, ups=0.75, wpb=489835, bsz=16253.8, num_updates=25000, lr=0.0004, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=33989 epoch 015: 1179 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=366800, ups=0.75, wpb=489835, bsz=16253.8, num_updates=25000, lr=0.0004, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=33989 epoch 015: 1179 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=366800, ups=0.75, wpb=489835, bsz=16253.8, num_updates=25000, lr=0.0004, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=33989 epoch 015: 1179 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=366800, ups=0.75, wpb=489835, bsz=16253.8, num_updates=25000, lr=0.0004, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=33989 epoch 015: 1179 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=366800, ups=0.75, wpb=489835, bsz=16253.8, num_updates=25000, lr=0.0004, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=33989 epoch 015: 1179 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=366800, ups=0.75, wpb=489835, bsz=16253.8, num_updates=25000, lr=0.0004, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=33989 epoch 015: 1179 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=366800, ups=0.75, wpb=489835, bsz=16253.8, num_updates=25000, lr=0.0004, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=33989 epoch 015: 1179 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=366800, ups=0.75, wpb=489835, bsz=16253.8, num_updates=25000, lr=0.0004, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=33989 epoch 015: 1179 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=366800, ups=0.75, wpb=489835, bsz=16253.8, num_updates=25000, lr=0.0004, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=33989 epoch 015: 1179 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=366800, ups=0.75, wpb=489835, bsz=16253.8, num_updates=25000, lr=0.0004, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=33989 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 3.786 | nll_loss 2.239 | ppl 4.72 | wps 221602 | wpb 22263 | bsz 1004 | num_updates 25000 | best_loss 3.786 epoch 015 | valid on 'valid' subset | loss 3.786 | nll_loss 2.239 | ppl 4.72 | wps 221602 | wpb 22263 | bsz 1004 | num_updates 25000 | best_loss 3.786 epoch 015 | valid on 'valid' subset | loss 3.786 | nll_loss 2.239 | ppl 4.72 | wps 221602 | wpb 22263 | bsz 1004 | num_updates 25000 | best_loss 3.786 epoch 015 | valid on 'valid' subset | loss 3.786 | nll_loss 2.239 | ppl 4.72 | wps 221602 | wpb 22263 | bsz 1004 | num_updates 25000 | best_loss 3.786 epoch 015 | valid on 'valid' subset | loss 3.786 | nll_loss 2.239 | ppl 4.72 | wps 221602 | wpb 22263 | bsz 1004 | num_updates 25000 | best_loss 3.786 epoch 015 | valid on 'valid' subset | loss 3.786 | nll_loss 2.239 | ppl 4.72 | wps 221602 | wpb 22263 | bsz 1004 | num_updates 25000 | best_loss 3.786 epoch 015 | valid on 'valid' subset | loss 3.786 | nll_loss 2.239 | ppl 4.72 | wps 221602 | wpb 22263 | bsz 1004 | num_updates 25000 | best_loss 3.786 epoch 015 | valid on 'valid' subset | loss 3.786 | nll_loss 2.239 | ppl 4.72 | wps 221602 | wpb 22263 | bsz 1004 | num_updates 25000 | best_loss 3.786 epoch 015 | valid on 'valid' subset | loss 3.786 | nll_loss 2.239 | ppl 4.72 | wps 221602 | wpb 22263 | bsz 1004 | num_updates 25000 | best_loss 3.786 epoch 015 | valid on 'valid' subset | loss 3.786 | nll_loss 2.239 | ppl 4.72 | wps 221602 | wpb 22263 | bsz 1004 | num_updates 25000 | best_loss 3.786 epoch 015 | valid on 'valid' subset | loss 3.786 | nll_loss 2.239 | ppl 4.72 | wps 221602 | wpb 22263 | bsz 1004 | num_updates 25000 | best_loss 3.786 epoch 015 | valid on 'valid' subset | loss 3.786 | nll_loss 2.239 | ppl 4.72 | wps 221602 | wpb 22263 | bsz 1004 | num_updates 25000 | best_loss 3.786 epoch 015 | valid on 'valid' subset | loss 3.786 | nll_loss 2.239 | ppl 4.72 | wps 221602 | wpb 22263 | bsz 1004 | num_updates 25000 | best_loss 3.786 epoch 015 | valid on 'valid' subset | loss 3.786 | nll_loss 2.239 | ppl 4.72 | wps 221602 | wpb 22263 | bsz 1004 | num_updates 25000 | best_loss 3.786 epoch 015 | valid on 'valid' subset | loss 3.786 | nll_loss 2.239 | ppl 4.72 | wps 221602 | wpb 22263 | bsz 1004 | num_updates 25000 | best_loss 3.786 epoch 015: 1280 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323962, ups=0.66, wpb=491099, bsz=16308.3, num_updates=25100, lr=0.000399202, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=34141 epoch 015: 1280 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323962, ups=0.66, wpb=491099, bsz=16308.3, num_updates=25100, lr=0.000399202, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=34141 epoch 015: 1280 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323962, ups=0.66, wpb=491099, bsz=16308.3, num_updates=25100, lr=0.000399202, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=34141 epoch 015: 1280 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323962, ups=0.66, wpb=491099, bsz=16308.3, num_updates=25100, lr=0.000399202, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=34141 epoch 015: 1280 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323962, ups=0.66, wpb=491099, bsz=16308.3, num_updates=25100, lr=0.000399202, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=34141 epoch 015: 1280 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323962, ups=0.66, wpb=491099, bsz=16308.3, num_updates=25100, lr=0.000399202, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=34141 epoch 015: 1280 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323962, ups=0.66, wpb=491099, bsz=16308.3, num_updates=25100, lr=0.000399202, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=34141 epoch 015: 1280 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323962, ups=0.66, wpb=491099, bsz=16308.3, num_updates=25100, lr=0.000399202, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=34141 epoch 015: 1280 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323962, ups=0.66, wpb=491099, bsz=16308.3, num_updates=25100, lr=0.000399202, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=34141 epoch 015: 1280 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323962, ups=0.66, wpb=491099, bsz=16308.3, num_updates=25100, lr=0.000399202, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=34141 epoch 015: 1280 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323962, ups=0.66, wpb=491099, bsz=16308.3, num_updates=25100, lr=0.000399202, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=34141 epoch 015: 1280 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323962, ups=0.66, wpb=491099, bsz=16308.3, num_updates=25100, lr=0.000399202, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=34141 epoch 015: 1280 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323962, ups=0.66, wpb=491099, bsz=16308.3, num_updates=25100, lr=0.000399202, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=34141 epoch 015: 1280 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323962, ups=0.66, wpb=491099, bsz=16308.3, num_updates=25100, lr=0.000399202, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=34141 epoch 015: 1280 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323962, ups=0.66, wpb=491099, bsz=16308.3, num_updates=25100, lr=0.000399202, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=34141 epoch 015: 1380 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=368010, ups=0.75, wpb=490217, bsz=16511.3, num_updates=25200, lr=0.00039841, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=34274 epoch 015: 1380 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=368010, ups=0.75, wpb=490217, bsz=16511.3, num_updates=25200, lr=0.00039841, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=34274 epoch 015: 1380 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=368010, ups=0.75, wpb=490217, bsz=16511.3, num_updates=25200, lr=0.00039841, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=34274 epoch 015: 1380 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=368010, ups=0.75, wpb=490217, bsz=16511.3, num_updates=25200, lr=0.00039841, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=34274 epoch 015: 1380 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=368010, ups=0.75, wpb=490217, bsz=16511.3, num_updates=25200, lr=0.00039841, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=34274 epoch 015: 1380 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=368010, ups=0.75, wpb=490217, bsz=16511.3, num_updates=25200, lr=0.00039841, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=34274 epoch 015: 1380 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=368010, ups=0.75, wpb=490217, bsz=16511.3, num_updates=25200, lr=0.00039841, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=34274 epoch 015: 1380 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=368010, ups=0.75, wpb=490217, bsz=16511.3, num_updates=25200, lr=0.00039841, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=34274 epoch 015: 1380 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=368010, ups=0.75, wpb=490217, bsz=16511.3, num_updates=25200, lr=0.00039841, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=34274 epoch 015: 1380 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=368010, ups=0.75, wpb=490217, bsz=16511.3, num_updates=25200, lr=0.00039841, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=34274 epoch 015: 1380 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=368010, ups=0.75, wpb=490217, bsz=16511.3, num_updates=25200, lr=0.00039841, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=34274 epoch 015: 1380 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=368010, ups=0.75, wpb=490217, bsz=16511.3, num_updates=25200, lr=0.00039841, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=34274 epoch 015: 1380 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=368010, ups=0.75, wpb=490217, bsz=16511.3, num_updates=25200, lr=0.00039841, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=34274 epoch 015: 1380 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=368010, ups=0.75, wpb=490217, bsz=16511.3, num_updates=25200, lr=0.00039841, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=34274 epoch 015: 1380 / 1707 loss=3.642, nll_loss=2.099, ppl=4.28, wps=368010, ups=0.75, wpb=490217, bsz=16511.3, num_updates=25200, lr=0.00039841, gnorm=0.258, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=34274 epoch 015: 1481 / 1707 loss=3.641, nll_loss=2.098, ppl=4.28, wps=361542, ups=0.74, wpb=489140, bsz=16486.8, num_updates=25300, lr=0.000397621, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=34409 epoch 015: 1481 / 1707 loss=3.641, nll_loss=2.098, ppl=4.28, wps=361542, ups=0.74, wpb=489140, bsz=16486.8, num_updates=25300, lr=0.000397621, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=34409 epoch 015: 1481 / 1707 loss=3.641, nll_loss=2.098, ppl=4.28, wps=361542, ups=0.74, wpb=489140, bsz=16486.8, num_updates=25300, lr=0.000397621, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=34409 epoch 015: 1481 / 1707 loss=3.641, nll_loss=2.098, ppl=4.28, wps=361542, ups=0.74, wpb=489140, bsz=16486.8, num_updates=25300, lr=0.000397621, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=34409 epoch 015: 1481 / 1707 loss=3.641, nll_loss=2.098, ppl=4.28, wps=361542, ups=0.74, wpb=489140, bsz=16486.8, num_updates=25300, lr=0.000397621, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=34409 epoch 015: 1481 / 1707 loss=3.641, nll_loss=2.098, ppl=4.28, wps=361542, ups=0.74, wpb=489140, bsz=16486.8, num_updates=25300, lr=0.000397621, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=34409 epoch 015: 1481 / 1707 loss=3.641, nll_loss=2.098, ppl=4.28, wps=361542, ups=0.74, wpb=489140, bsz=16486.8, num_updates=25300, lr=0.000397621, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=34409 epoch 015: 1481 / 1707 loss=3.641, nll_loss=2.098, ppl=4.28, wps=361542, ups=0.74, wpb=489140, bsz=16486.8, num_updates=25300, lr=0.000397621, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=34409 epoch 015: 1481 / 1707 loss=3.641, nll_loss=2.098, ppl=4.28, wps=361542, ups=0.74, wpb=489140, bsz=16486.8, num_updates=25300, lr=0.000397621, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=34409 epoch 015: 1481 / 1707 loss=3.641, nll_loss=2.098, ppl=4.28, wps=361542, ups=0.74, wpb=489140, bsz=16486.8, num_updates=25300, lr=0.000397621, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=34409 epoch 015: 1481 / 1707 loss=3.641, nll_loss=2.098, ppl=4.28, wps=361542, ups=0.74, wpb=489140, bsz=16486.8, num_updates=25300, lr=0.000397621, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=34409 epoch 015: 1481 / 1707 loss=3.641, nll_loss=2.098, ppl=4.28, wps=361542, ups=0.74, wpb=489140, bsz=16486.8, num_updates=25300, lr=0.000397621, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=34409 epoch 015: 1481 / 1707 loss=3.641, nll_loss=2.098, ppl=4.28, wps=361542, ups=0.74, wpb=489140, bsz=16486.8, num_updates=25300, lr=0.000397621, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=34409 epoch 015: 1481 / 1707 loss=3.641, nll_loss=2.098, ppl=4.28, wps=361542, ups=0.74, wpb=489140, bsz=16486.8, num_updates=25300, lr=0.000397621, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=34409 epoch 015: 1481 / 1707 loss=3.641, nll_loss=2.098, ppl=4.28, wps=361542, ups=0.74, wpb=489140, bsz=16486.8, num_updates=25300, lr=0.000397621, gnorm=0.261, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=34409 epoch 015: 1581 / 1707 loss=3.642, nll_loss=2.1, ppl=4.29, wps=367680, ups=0.75, wpb=490958, bsz=16333.1, num_updates=25400, lr=0.000396838, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=34543 epoch 015: 1581 / 1707 loss=3.642, nll_loss=2.1, ppl=4.29, wps=367680, ups=0.75, wpb=490958, bsz=16333.1, num_updates=25400, lr=0.000396838, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=34543 epoch 015: 1581 / 1707 loss=3.642, nll_loss=2.1, ppl=4.29, wps=367680, ups=0.75, wpb=490958, bsz=16333.1, num_updates=25400, lr=0.000396838, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=34543 epoch 015: 1581 / 1707 loss=3.642, nll_loss=2.1, ppl=4.29, wps=367680, ups=0.75, wpb=490958, bsz=16333.1, num_updates=25400, lr=0.000396838, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=34543 epoch 015: 1581 / 1707 loss=3.642, nll_loss=2.1, ppl=4.29, wps=367680, ups=0.75, wpb=490958, bsz=16333.1, num_updates=25400, lr=0.000396838, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=34543 epoch 015: 1581 / 1707 loss=3.642, nll_loss=2.1, ppl=4.29, wps=367680, ups=0.75, wpb=490958, bsz=16333.1, num_updates=25400, lr=0.000396838, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=34543 epoch 015: 1581 / 1707 loss=3.642, nll_loss=2.1, ppl=4.29, wps=367680, ups=0.75, wpb=490958, bsz=16333.1, num_updates=25400, lr=0.000396838, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=34543 epoch 015: 1581 / 1707 loss=3.642, nll_loss=2.1, ppl=4.29, wps=367680, ups=0.75, wpb=490958, bsz=16333.1, num_updates=25400, lr=0.000396838, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=34543 epoch 015: 1581 / 1707 loss=3.642, nll_loss=2.1, ppl=4.29, wps=367680, ups=0.75, wpb=490958, bsz=16333.1, num_updates=25400, lr=0.000396838, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=34543 epoch 015: 1581 / 1707 loss=3.642, nll_loss=2.1, ppl=4.29, wps=367680, ups=0.75, wpb=490958, bsz=16333.1, num_updates=25400, lr=0.000396838, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=34543 epoch 015: 1581 / 1707 loss=3.642, nll_loss=2.1, ppl=4.29, wps=367680, ups=0.75, wpb=490958, bsz=16333.1, num_updates=25400, lr=0.000396838, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=34543 epoch 015: 1581 / 1707 loss=3.642, nll_loss=2.1, ppl=4.29, wps=367680, ups=0.75, wpb=490958, bsz=16333.1, num_updates=25400, lr=0.000396838, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=34543 epoch 015: 1581 / 1707 loss=3.642, nll_loss=2.1, ppl=4.29, wps=367680, ups=0.75, wpb=490958, bsz=16333.1, num_updates=25400, lr=0.000396838, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=34543 epoch 015: 1581 / 1707 loss=3.642, nll_loss=2.1, ppl=4.29, wps=367680, ups=0.75, wpb=490958, bsz=16333.1, num_updates=25400, lr=0.000396838, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=34543 epoch 015: 1581 / 1707 loss=3.642, nll_loss=2.1, ppl=4.29, wps=367680, ups=0.75, wpb=490958, bsz=16333.1, num_updates=25400, lr=0.000396838, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=34543 epoch 015: 1681 / 1707 loss=3.637, nll_loss=2.093, ppl=4.27, wps=366954, ups=0.75, wpb=490397, bsz=16178.6, num_updates=25500, lr=0.000396059, gnorm=0.252, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=34677 epoch 015: 1681 / 1707 loss=3.637, nll_loss=2.093, ppl=4.27, wps=366954, ups=0.75, wpb=490397, bsz=16178.6, num_updates=25500, lr=0.000396059, gnorm=0.252, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=34677 epoch 015: 1681 / 1707 loss=3.637, nll_loss=2.093, ppl=4.27, wps=366954, ups=0.75, wpb=490397, bsz=16178.6, num_updates=25500, lr=0.000396059, gnorm=0.252, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=34677 epoch 015: 1681 / 1707 loss=3.637, nll_loss=2.093, ppl=4.27, wps=366954, ups=0.75, wpb=490397, bsz=16178.6, num_updates=25500, lr=0.000396059, gnorm=0.252, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=34677 epoch 015: 1681 / 1707 loss=3.637, nll_loss=2.093, ppl=4.27, wps=366954, ups=0.75, wpb=490397, bsz=16178.6, num_updates=25500, lr=0.000396059, gnorm=0.252, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=34677 epoch 015: 1681 / 1707 loss=3.637, nll_loss=2.093, ppl=4.27, wps=366954, ups=0.75, wpb=490397, bsz=16178.6, num_updates=25500, lr=0.000396059, gnorm=0.252, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=34677 epoch 015: 1681 / 1707 loss=3.637, nll_loss=2.093, ppl=4.27, wps=366954, ups=0.75, wpb=490397, bsz=16178.6, num_updates=25500, lr=0.000396059, gnorm=0.252, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=34677 epoch 015: 1681 / 1707 loss=3.637, nll_loss=2.093, ppl=4.27, wps=366954, ups=0.75, wpb=490397, bsz=16178.6, num_updates=25500, lr=0.000396059, gnorm=0.252, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=34677 epoch 015: 1681 / 1707 loss=3.637, nll_loss=2.093, ppl=4.27, wps=366954, ups=0.75, wpb=490397, bsz=16178.6, num_updates=25500, lr=0.000396059, gnorm=0.252, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=34677 epoch 015: 1681 / 1707 loss=3.637, nll_loss=2.093, ppl=4.27, wps=366954, ups=0.75, wpb=490397, bsz=16178.6, num_updates=25500, lr=0.000396059, gnorm=0.252, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=34677 epoch 015: 1681 / 1707 loss=3.637, nll_loss=2.093, ppl=4.27, wps=366954, ups=0.75, wpb=490397, bsz=16178.6, num_updates=25500, lr=0.000396059, gnorm=0.252, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=34677 epoch 015: 1681 / 1707 loss=3.637, nll_loss=2.093, ppl=4.27, wps=366954, ups=0.75, wpb=490397, bsz=16178.6, num_updates=25500, lr=0.000396059, gnorm=0.252, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=34677 epoch 015: 1681 / 1707 loss=3.637, nll_loss=2.093, ppl=4.27, wps=366954, ups=0.75, wpb=490397, bsz=16178.6, num_updates=25500, lr=0.000396059, gnorm=0.252, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=34677 epoch 015: 1681 / 1707 loss=3.637, nll_loss=2.093, ppl=4.27, wps=366954, ups=0.75, wpb=490397, bsz=16178.6, num_updates=25500, lr=0.000396059, gnorm=0.252, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=34677 epoch 015: 1681 / 1707 loss=3.637, nll_loss=2.093, ppl=4.27, wps=366954, ups=0.75, wpb=490397, bsz=16178.6, num_updates=25500, lr=0.000396059, gnorm=0.252, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=34677 end of epoch 15 (average epoch stats below) epoch 015 | loss 3.633 | nll_loss 2.089 | ppl 4.25 | wps 360168 | ups 0.74 | wpb 489894 | bsz 16333.6 | num_updates 25526 | lr 0.000395857 | gnorm 0.257 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.8 | wall 34710 epoch 015 | loss 3.633 | nll_loss 2.089 | ppl 4.25 | wps 360168 | ups 0.74 | wpb 489894 | bsz 16333.6 | num_updates 25526 | lr 0.000395857 | gnorm 0.257 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.8 | wall 34710 epoch 015 | loss 3.633 | nll_loss 2.089 | ppl 4.25 | wps 360168 | ups 0.74 | wpb 489894 | bsz 16333.6 | num_updates 25526 | lr 0.000395857 | gnorm 0.257 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.8 | wall 34710 epoch 015 | loss 3.633 | nll_loss 2.089 | ppl 4.25 | wps 360168 | ups 0.74 | wpb 489894 | bsz 16333.6 | num_updates 25526 | lr 0.000395857 | gnorm 0.257 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.8 | wall 34710 epoch 015 | loss 3.633 | nll_loss 2.089 | ppl 4.25 | wps 360168 | ups 0.74 | wpb 489894 | bsz 16333.6 | num_updates 25526 | lr 0.000395857 | gnorm 0.257 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.8 | wall 34710 epoch 015 | loss 3.633 | nll_loss 2.089 | ppl 4.25 | wps 360168 | ups 0.74 | wpb 489894 | bsz 16333.6 | num_updates 25526 | lr 0.000395857 | gnorm 0.257 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.8 | wall 34710 epoch 015 | loss 3.633 | nll_loss 2.089 | ppl 4.25 | wps 360168 | ups 0.74 | wpb 489894 | bsz 16333.6 | num_updates 25526 | lr 0.000395857 | gnorm 0.257 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.8 | wall 34710 epoch 015 | loss 3.633 | nll_loss 2.089 | ppl 4.25 | wps 360168 | ups 0.74 | wpb 489894 | bsz 16333.6 | num_updates 25526 | lr 0.000395857 | gnorm 0.257 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.8 | wall 34710 epoch 015 | loss 3.633 | nll_loss 2.089 | ppl 4.25 | wps 360168 | ups 0.74 | wpb 489894 | bsz 16333.6 | num_updates 25526 | lr 0.000395857 | gnorm 0.257 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.8 | wall 34710 epoch 015 | loss 3.633 | nll_loss 2.089 | ppl 4.25 | wps 360168 | ups 0.74 | wpb 489894 | bsz 16333.6 | num_updates 25526 | lr 0.000395857 | gnorm 0.257 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.8 | wall 34710 epoch 015 | loss 3.633 | nll_loss 2.089 | ppl 4.25 | wps 360168 | ups 0.74 | wpb 489894 | bsz 16333.6 | num_updates 25526 | lr 0.000395857 | gnorm 0.257 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.8 | wall 34710 epoch 015 | loss 3.633 | nll_loss 2.089 | ppl 4.25 | wps 360168 | ups 0.74 | wpb 489894 | bsz 16333.6 | num_updates 25526 | lr 0.000395857 | gnorm 0.257 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.8 | wall 34710 epoch 015 | loss 3.633 | nll_loss 2.089 | ppl 4.25 | wps 360168 | ups 0.74 | wpb 489894 | bsz 16333.6 | num_updates 25526 | lr 0.000395857 | gnorm 0.257 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.8 | wall 34710 epoch 015 | loss 3.633 | nll_loss 2.089 | ppl 4.25 | wps 360168 | ups 0.74 | wpb 489894 | bsz 16333.6 | num_updates 25526 | lr 0.000395857 | gnorm 0.257 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.8 | wall 34710 epoch 015 | loss 3.633 | nll_loss 2.089 | ppl 4.25 | wps 360168 | ups 0.74 | wpb 489894 | bsz 16333.6 | num_updates 25526 | lr 0.000395857 | gnorm 0.257 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.8 | wall 34710 Start iterating over samples epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 75 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=360781, ups=0.74, wpb=485250, bsz=16464.4, num_updates=25600, lr=0.000395285, gnorm=0.26, clip=0, loss_scale=2, train_wall=134, gb_free=60.2, wall=34811 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 175 / 1707 loss=3.606, nll_loss=2.057, ppl=4.16, wps=366888, ups=0.75, wpb=490693, bsz=16561.9, num_updates=25700, lr=0.000394515, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=34945 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 276 / 1707 loss=3.612, nll_loss=2.064, ppl=4.18, wps=361822, ups=0.74, wpb=490627, bsz=16170, num_updates=25800, lr=0.00039375, gnorm=0.253, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=35080 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 376 / 1707 loss=3.619, nll_loss=2.073, ppl=4.21, wps=367320, ups=0.75, wpb=490091, bsz=16224.2, num_updates=25900, lr=0.000392989, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=35214 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 epoch 016: 476 / 1707 loss=3.616, nll_loss=2.069, ppl=4.2, wps=366919, ups=0.75, wpb=489362, bsz=16069.8, num_updates=26000, lr=0.000392232, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=35347 begin validation on "valid" subset epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016 | valid on 'valid' subset | loss 3.807 | nll_loss 2.262 | ppl 4.8 | wps 218920 | wpb 22263 | bsz 1004 | num_updates 26000 | best_loss 3.786 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 577 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=333024, ups=0.68, wpb=491066, bsz=16338.9, num_updates=26100, lr=0.00039148, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=35495 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 677 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=367988, ups=0.75, wpb=490853, bsz=16306.6, num_updates=26200, lr=0.000390732, gnorm=0.255, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=35628 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 777 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=370957, ups=0.75, wpb=492311, bsz=16170.2, num_updates=26300, lr=0.000389989, gnorm=0.251, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=35761 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 877 / 1707 loss=3.621, nll_loss=2.076, ppl=4.22, wps=366177, ups=0.75, wpb=490368, bsz=16564.3, num_updates=26400, lr=0.000389249, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=35895 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 977 / 1707 loss=3.624, nll_loss=2.079, ppl=4.22, wps=365757, ups=0.75, wpb=489594, bsz=16319.8, num_updates=26500, lr=0.000388514, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=36029 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1077 / 1707 loss=3.63, nll_loss=2.085, ppl=4.24, wps=366137, ups=0.75, wpb=489015, bsz=16369.6, num_updates=26600, lr=0.000387783, gnorm=0.266, clip=0, loss_scale=4, train_wall=133, gb_free=61, wall=36162 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1177 / 1707 loss=3.63, nll_loss=2.086, ppl=4.24, wps=365780, ups=0.75, wpb=490453, bsz=16566.5, num_updates=26700, lr=0.000387056, gnorm=0.259, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=36296 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1277 / 1707 loss=3.628, nll_loss=2.084, ppl=4.24, wps=368787, ups=0.75, wpb=490863, bsz=16457, num_updates=26800, lr=0.000386334, gnorm=0.246, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=36429 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1378 / 1707 loss=3.621, nll_loss=2.075, ppl=4.21, wps=362743, ups=0.74, wpb=489667, bsz=15976.2, num_updates=26900, lr=0.000385615, gnorm=0.255, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=36564 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 epoch 016: 1479 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=363312, ups=0.74, wpb=489594, bsz=16566, num_updates=27000, lr=0.0003849, gnorm=0.261, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=36699 begin validation on "valid" subset epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016 | valid on 'valid' subset | loss 3.783 | nll_loss 2.236 | ppl 4.71 | wps 217783 | wpb 22263 | bsz 1004 | num_updates 27000 | best_loss 3.783 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1579 / 1707 loss=3.632, nll_loss=2.088, ppl=4.25, wps=323893, ups=0.66, wpb=488423, bsz=16264.4, num_updates=27100, lr=0.000384189, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=36850 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 epoch 016: 1679 / 1707 loss=3.631, nll_loss=2.087, ppl=4.25, wps=367249, ups=0.75, wpb=489928, bsz=16316.8, num_updates=27200, lr=0.000383482, gnorm=0.248, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=36983 end of epoch 16 (average epoch stats below) epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 epoch 016 | loss 3.621 | nll_loss 2.076 | ppl 4.22 | wps 361074 | ups 0.74 | wpb 489905 | bsz 16331.7 | num_updates 27228 | lr 0.000383285 | gnorm 0.255 | clip 0 | loss_scale 4 | train_wall 2270 | gb_free 61.3 | wall 37019 Start iterating over samples epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 73 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=363121, ups=0.75, wpb=486559, bsz=16218.6, num_updates=27300, lr=0.00038278, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=37117 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 173 / 1707 loss=3.598, nll_loss=2.049, ppl=4.14, wps=365378, ups=0.75, wpb=489699, bsz=16506.6, num_updates=27400, lr=0.00038208, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=37251 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 273 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=366149, ups=0.75, wpb=488872, bsz=16368.6, num_updates=27500, lr=0.000381385, gnorm=0.249, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=37385 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 374 / 1707 loss=3.601, nll_loss=2.053, ppl=4.15, wps=362623, ups=0.74, wpb=490825, bsz=16443.6, num_updates=27600, lr=0.000380693, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=37520 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 474 / 1707 loss=3.605, nll_loss=2.057, ppl=4.16, wps=365586, ups=0.75, wpb=489822, bsz=16363.1, num_updates=27700, lr=0.000380006, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=37654 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 574 / 1707 loss=3.61, nll_loss=2.063, ppl=4.18, wps=366191, ups=0.75, wpb=489980, bsz=16234.3, num_updates=27800, lr=0.000379322, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=37788 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 675 / 1707 loss=3.607, nll_loss=2.059, ppl=4.17, wps=362478, ups=0.74, wpb=489760, bsz=16341.6, num_updates=27900, lr=0.000378641, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.1, wall=37923 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 epoch 017: 775 / 1707 loss=3.612, nll_loss=2.065, ppl=4.18, wps=368265, ups=0.75, wpb=490667, bsz=16115.5, num_updates=28000, lr=0.000377964, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=38056 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017 | valid on 'valid' subset | loss 3.792 | nll_loss 2.245 | ppl 4.74 | wps 222309 | wpb 22263 | bsz 1004 | num_updates 28000 | best_loss 3.783 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 875 / 1707 loss=3.617, nll_loss=2.071, ppl=4.2, wps=322489, ups=0.66, wpb=488795, bsz=16341.7, num_updates=28100, lr=0.000377291, gnorm=0.255, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38208 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 976 / 1707 loss=3.615, nll_loss=2.068, ppl=4.19, wps=361371, ups=0.74, wpb=489695, bsz=16391.5, num_updates=28200, lr=0.000376622, gnorm=0.256, clip=0, loss_scale=2, train_wall=135, gb_free=60.8, wall=38343 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1076 / 1707 loss=3.622, nll_loss=2.077, ppl=4.22, wps=368274, ups=0.75, wpb=490289, bsz=16350.6, num_updates=28300, lr=0.000375956, gnorm=0.263, clip=1, loss_scale=2, train_wall=133, gb_free=60.4, wall=38476 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1176 / 1707 loss=3.612, nll_loss=2.066, ppl=4.19, wps=367854, ups=0.75, wpb=491326, bsz=16215.9, num_updates=28400, lr=0.000375293, gnorm=0.254, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=38610 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1276 / 1707 loss=3.616, nll_loss=2.07, ppl=4.2, wps=365603, ups=0.75, wpb=489551, bsz=16232.8, num_updates=28500, lr=0.000374634, gnorm=0.253, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=38744 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1376 / 1707 loss=3.613, nll_loss=2.067, ppl=4.19, wps=366115, ups=0.75, wpb=489927, bsz=16262.4, num_updates=28600, lr=0.000373979, gnorm=0.259, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=38878 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1478 / 1707 loss=3.616, nll_loss=2.071, ppl=4.2, wps=360142, ups=0.73, wpb=491159, bsz=16288.8, num_updates=28700, lr=0.000373327, gnorm=0.255, clip=0, loss_scale=2, train_wall=136, gb_free=60.5, wall=39014 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1579 / 1707 loss=3.618, nll_loss=2.072, ppl=4.21, wps=365177, ups=0.74, wpb=491106, bsz=16253.5, num_updates=28800, lr=0.000372678, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=39149 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 epoch 017: 1679 / 1707 loss=3.622, nll_loss=2.078, ppl=4.22, wps=366839, ups=0.75, wpb=490194, bsz=16637.3, num_updates=28900, lr=0.000372033, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=39282 end of epoch 17 (average epoch stats below) epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 epoch 017 | loss 3.611 | nll_loss 2.065 | ppl 4.18 | wps 362219 | ups 0.74 | wpb 489905 | bsz 16326.4 | num_updates 28928 | lr 0.000371853 | gnorm 0.254 | clip 0.1 | loss_scale 1 | train_wall 2272 | gb_free 61 | wall 39319 Start iterating over samples epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 epoch 018: 72 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=366270, ups=0.75, wpb=485823, bsz=16206.2, num_updates=29000, lr=0.000371391, gnorm=0.253, clip=0, loss_scale=1, train_wall=132, gb_free=60.7, wall=39415 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018 | valid on 'valid' subset | loss 3.786 | nll_loss 2.236 | ppl 4.71 | wps 221571 | wpb 22263 | bsz 1004 | num_updates 29000 | best_loss 3.783 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 172 / 1707 loss=3.59, nll_loss=2.04, ppl=4.11, wps=322235, ups=0.66, wpb=490151, bsz=16249, num_updates=29100, lr=0.000370752, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=39567 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 272 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=366140, ups=0.75, wpb=490671, bsz=16599.8, num_updates=29200, lr=0.000370117, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=39701 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 372 / 1707 loss=3.594, nll_loss=2.045, ppl=4.13, wps=368376, ups=0.75, wpb=491118, bsz=16270.2, num_updates=29300, lr=0.000369484, gnorm=0.252, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=39834 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 472 / 1707 loss=3.6, nll_loss=2.052, ppl=4.15, wps=367164, ups=0.75, wpb=489842, bsz=16236.5, num_updates=29400, lr=0.000368856, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=39968 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 573 / 1707 loss=3.596, nll_loss=2.047, ppl=4.13, wps=362091, ups=0.74, wpb=490021, bsz=16360.2, num_updates=29500, lr=0.00036823, gnorm=0.252, clip=0, loss_scale=2, train_wall=135, gb_free=60.2, wall=40103 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 674 / 1707 loss=3.603, nll_loss=2.055, ppl=4.16, wps=361046, ups=0.74, wpb=488532, bsz=16227.1, num_updates=29600, lr=0.000367607, gnorm=0.25, clip=0, loss_scale=1, train_wall=135, gb_free=61, wall=40238 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 774 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=367726, ups=0.75, wpb=489548, bsz=16428.1, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=61.1, wall=40372 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 874 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=367757, ups=0.75, wpb=490942, bsz=16473.4, num_updates=29800, lr=0.000366372, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=40505 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 974 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365941, ups=0.75, wpb=490681, bsz=16253.2, num_updates=29900, lr=0.000365758, gnorm=0.24, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=40639 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 epoch 018: 1074 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=365067, ups=0.74, wpb=490087, bsz=16343.4, num_updates=30000, lr=0.000365148, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=40773 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018 | valid on 'valid' subset | loss 3.782 | nll_loss 2.231 | ppl 4.69 | wps 216538 | wpb 22263 | bsz 1004 | num_updates 30000 | best_loss 3.782 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1175 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=311891, ups=0.64, wpb=490707, bsz=16260.3, num_updates=30100, lr=0.000364541, gnorm=0.254, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=40931 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1275 / 1707 loss=3.608, nll_loss=2.061, ppl=4.17, wps=368039, ups=0.75, wpb=490570, bsz=16487.8, num_updates=30200, lr=0.000363937, gnorm=0.249, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=41064 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1375 / 1707 loss=3.611, nll_loss=2.065, ppl=4.18, wps=366293, ups=0.75, wpb=489458, bsz=16353.7, num_updates=30300, lr=0.000363336, gnorm=0.257, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=41198 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1476 / 1707 loss=3.607, nll_loss=2.06, ppl=4.17, wps=363573, ups=0.74, wpb=490994, bsz=16152.8, num_updates=30400, lr=0.000362738, gnorm=0.245, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=41333 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1576 / 1707 loss=3.608, nll_loss=2.062, ppl=4.18, wps=367233, ups=0.75, wpb=489653, bsz=16181.4, num_updates=30500, lr=0.000362143, gnorm=0.251, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=41466 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 epoch 018: 1677 / 1707 loss=3.609, nll_loss=2.063, ppl=4.18, wps=363023, ups=0.74, wpb=489600, bsz=16404.9, num_updates=30600, lr=0.000361551, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=41601 end of epoch 18 (average epoch stats below) epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 epoch 018 | loss 3.602 | nll_loss 2.054 | ppl 4.15 | wps 359216 | ups 0.73 | wpb 489889 | bsz 16330.3 | num_updates 30630 | lr 0.000361374 | gnorm 0.251 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.3 | wall 41640 Start iterating over samples epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 70 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=364469, ups=0.75, wpb=486187, bsz=16221.9, num_updates=30700, lr=0.000360961, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=41734 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 170 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=367114, ups=0.75, wpb=490783, bsz=16493.8, num_updates=30800, lr=0.000360375, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=41868 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=366923, ups=0.75, wpb=490604, bsz=16485.6, num_updates=30900, lr=0.000359791, gnorm=0.254, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=42002 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 epoch 019: 370 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=365916, ups=0.75, wpb=489890, bsz=16378.7, num_updates=31000, lr=0.000359211, gnorm=0.253, clip=0, loss_scale=2, train_wall=133, gb_free=60.1, wall=42136 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019 | valid on 'valid' subset | loss 3.788 | nll_loss 2.241 | ppl 4.73 | wps 221222 | wpb 22263 | bsz 1004 | num_updates 31000 | best_loss 3.782 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 470 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=321894, ups=0.66, wpb=489347, bsz=16550.7, num_updates=31100, lr=0.000358633, gnorm=0.258, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=42288 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 571 / 1707 loss=3.591, nll_loss=2.043, ppl=4.12, wps=363655, ups=0.74, wpb=490343, bsz=16413.5, num_updates=31200, lr=0.000358057, gnorm=0.25, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=42422 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 672 / 1707 loss=3.592, nll_loss=2.043, ppl=4.12, wps=364008, ups=0.74, wpb=490806, bsz=16358.8, num_updates=31300, lr=0.000357485, gnorm=0.25, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=42557 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 772 / 1707 loss=3.592, nll_loss=2.044, ppl=4.12, wps=366214, ups=0.75, wpb=489468, bsz=16231.9, num_updates=31400, lr=0.000356915, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=42691 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 872 / 1707 loss=3.596, nll_loss=2.049, ppl=4.14, wps=367901, ups=0.75, wpb=490977, bsz=16604.6, num_updates=31500, lr=0.000356348, gnorm=0.259, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=42824 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 973 / 1707 loss=3.598, nll_loss=2.05, ppl=4.14, wps=362945, ups=0.74, wpb=490099, bsz=16206.2, num_updates=31600, lr=0.000355784, gnorm=0.241, clip=0, loss_scale=1, train_wall=135, gb_free=60.6, wall=42959 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1073 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=366934, ups=0.75, wpb=490084, bsz=16297.8, num_updates=31700, lr=0.000355222, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43093 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1173 / 1707 loss=3.604, nll_loss=2.057, ppl=4.16, wps=367147, ups=0.75, wpb=490613, bsz=16323.4, num_updates=31800, lr=0.000354663, gnorm=0.254, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=43227 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1275 / 1707 loss=3.603, nll_loss=2.056, ppl=4.16, wps=359993, ups=0.74, wpb=489439, bsz=16481.8, num_updates=31900, lr=0.000354107, gnorm=0.254, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=43363 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 epoch 019: 1375 / 1707 loss=3.599, nll_loss=2.052, ppl=4.15, wps=366556, ups=0.75, wpb=490363, bsz=16251.4, num_updates=32000, lr=0.000353553, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=43496 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019 | valid on 'valid' subset | loss 3.763 | nll_loss 2.211 | ppl 4.63 | wps 220276 | wpb 22263 | bsz 1004 | num_updates 32000 | best_loss 3.763 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1475 / 1707 loss=3.598, nll_loss=2.051, ppl=4.14, wps=322957, ups=0.66, wpb=489768, bsz=16337.8, num_updates=32100, lr=0.000353002, gnorm=0.263, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=43648 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1575 / 1707 loss=3.605, nll_loss=2.058, ppl=4.16, wps=368389, ups=0.75, wpb=490065, bsz=16073.7, num_updates=32200, lr=0.000352454, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=43781 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 epoch 019: 1675 / 1707 loss=3.601, nll_loss=2.054, ppl=4.15, wps=367305, ups=0.75, wpb=489208, bsz=16126.2, num_updates=32300, lr=0.000351908, gnorm=0.256, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=43914 end of epoch 19 (average epoch stats below) epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 epoch 019 | loss 3.594 | nll_loss 2.045 | ppl 4.13 | wps 359961 | ups 0.73 | wpb 489894 | bsz 16331.9 | num_updates 32332 | lr 0.000351733 | gnorm 0.254 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 61.4 | wall 43956 Start iterating over samples epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 68 / 1707 loss=3.578, nll_loss=2.028, ppl=4.08, wps=354737, ups=0.73, wpb=486828, bsz=16306.3, num_updates=32400, lr=0.000351364, gnorm=0.262, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=44051 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 169 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=364344, ups=0.74, wpb=490139, bsz=16152.6, num_updates=32500, lr=0.000350823, gnorm=0.256, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=44186 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 270 / 1707 loss=3.581, nll_loss=2.031, ppl=4.09, wps=363037, ups=0.74, wpb=489238, bsz=16118.8, num_updates=32600, lr=0.000350285, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=44321 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 370 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=367757, ups=0.75, wpb=490069, bsz=16450.7, num_updates=32700, lr=0.000349749, gnorm=0.252, clip=0, loss_scale=0.5, train_wall=133, gb_free=61, wall=44454 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 470 / 1707 loss=3.579, nll_loss=2.028, ppl=4.08, wps=365994, ups=0.75, wpb=489746, bsz=16119.9, num_updates=32800, lr=0.000349215, gnorm=0.245, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=44588 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 570 / 1707 loss=3.582, nll_loss=2.032, ppl=4.09, wps=365407, ups=0.74, wpb=490726, bsz=16433.1, num_updates=32900, lr=0.000348684, gnorm=0.245, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=44722 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 epoch 020: 670 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=367419, ups=0.75, wpb=490956, bsz=16574.7, num_updates=33000, lr=0.000348155, gnorm=0.244, clip=0, loss_scale=1, train_wall=133, gb_free=60.9, wall=44856 begin validation on "valid" subset epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.767 | nll_loss 2.221 | ppl 4.66 | wps 218131 | wpb 22263 | bsz 1004 | num_updates 33000 | best_loss 3.763 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 770 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=337261, ups=0.69, wpb=490425, bsz=16280.8, num_updates=33100, lr=0.000347629, gnorm=0.255, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=45001 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 870 / 1707 loss=3.588, nll_loss=2.039, ppl=4.11, wps=367762, ups=0.75, wpb=490356, bsz=16416.1, num_updates=33200, lr=0.000347105, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=45135 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 970 / 1707 loss=3.586, nll_loss=2.036, ppl=4.1, wps=367555, ups=0.75, wpb=490147, bsz=16217.7, num_updates=33300, lr=0.000346583, gnorm=0.25, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=45268 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1071 / 1707 loss=3.597, nll_loss=2.049, ppl=4.14, wps=362060, ups=0.74, wpb=490491, bsz=16417.5, num_updates=33400, lr=0.000346064, gnorm=0.249, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=45403 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1171 / 1707 loss=3.59, nll_loss=2.042, ppl=4.12, wps=368827, ups=0.75, wpb=490005, bsz=16264.6, num_updates=33500, lr=0.000345547, gnorm=0.254, clip=0, loss_scale=2, train_wall=132, gb_free=61, wall=45536 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1271 / 1707 loss=3.59, nll_loss=2.041, ppl=4.12, wps=365858, ups=0.75, wpb=489889, bsz=16563.3, num_updates=33600, lr=0.000345033, gnorm=0.26, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=45670 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1372 / 1707 loss=3.595, nll_loss=2.048, ppl=4.13, wps=363325, ups=0.74, wpb=489414, bsz=16411.7, num_updates=33700, lr=0.00034452, gnorm=0.255, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=45805 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1472 / 1707 loss=3.591, nll_loss=2.042, ppl=4.12, wps=367114, ups=0.75, wpb=490610, bsz=16356.2, num_updates=33800, lr=0.00034401, gnorm=0.26, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=45938 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1572 / 1707 loss=3.593, nll_loss=2.045, ppl=4.13, wps=365398, ups=0.75, wpb=490146, bsz=16375.8, num_updates=33900, lr=0.000343503, gnorm=0.256, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=46073 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 epoch 020: 1673 / 1707 loss=3.596, nll_loss=2.048, ppl=4.14, wps=361262, ups=0.74, wpb=489478, bsz=16096.2, num_updates=34000, lr=0.000342997, gnorm=0.254, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=46208 begin validation on "valid" subset epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 epoch 020 | valid on 'valid' subset | loss 3.778 | nll_loss 2.231 | ppl 4.7 | wps 215114 | wpb 22263 | bsz 1004 | num_updates 34000 | best_loss 3.763 end of epoch 20 (average epoch stats below) epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 epoch 020 | loss 3.586 | nll_loss 2.037 | ppl 4.1 | wps 361142 | ups 0.74 | wpb 489891 | bsz 16333.2 | num_updates 34034 | lr 0.000342826 | gnorm 0.253 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 61.3 | wall 46265 Start iterating over samples epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 67 / 1707 loss=3.573, nll_loss=2.021, ppl=4.06, wps=331713, ups=0.68, wpb=485796, bsz=16277.9, num_updates=34100, lr=0.000342494, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=46355 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 168 / 1707 loss=3.569, nll_loss=2.017, ppl=4.05, wps=362350, ups=0.74, wpb=490633, bsz=16278.1, num_updates=34200, lr=0.000341993, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.6, wall=46490 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 268 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=365030, ups=0.75, wpb=488887, bsz=16319.4, num_updates=34300, lr=0.000341494, gnorm=0.273, clip=1, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=46624 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 368 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365573, ups=0.75, wpb=489747, bsz=16349, num_updates=34400, lr=0.000340997, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=46758 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 469 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=364844, ups=0.74, wpb=490295, bsz=16122, num_updates=34500, lr=0.000340503, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.8, wall=46892 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 569 / 1707 loss=3.574, nll_loss=2.023, ppl=4.06, wps=366458, ups=0.75, wpb=490003, bsz=16085.3, num_updates=34600, lr=0.00034001, gnorm=0.269, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=47026 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 669 / 1707 loss=3.574, nll_loss=2.024, ppl=4.07, wps=365446, ups=0.75, wpb=490520, bsz=16699, num_updates=34700, lr=0.00033952, gnorm=0.246, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=47160 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 769 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=368392, ups=0.75, wpb=491488, bsz=16313, num_updates=34800, lr=0.000339032, gnorm=0.253, clip=0, loss_scale=1, train_wall=133, gb_free=60.2, wall=47294 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 870 / 1707 loss=3.584, nll_loss=2.034, ppl=4.1, wps=363249, ups=0.74, wpb=490110, bsz=16332.2, num_updates=34900, lr=0.000338546, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47429 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 epoch 021: 970 / 1707 loss=3.584, nll_loss=2.035, ppl=4.1, wps=366297, ups=0.75, wpb=491124, bsz=16325.3, num_updates=35000, lr=0.000338062, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=47563 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021 | valid on 'valid' subset | loss 3.778 | nll_loss 2.233 | ppl 4.7 | wps 215782 | wpb 22263 | bsz 1004 | num_updates 35000 | best_loss 3.763 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1070 / 1707 loss=3.582, nll_loss=2.033, ppl=4.09, wps=320929, ups=0.65, wpb=490049, bsz=16555.6, num_updates=35100, lr=0.00033758, gnorm=0.252, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=47715 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1170 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=366053, ups=0.75, wpb=489109, bsz=16151.1, num_updates=35200, lr=0.0003371, gnorm=0.257, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=47849 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1271 / 1707 loss=3.58, nll_loss=2.031, ppl=4.09, wps=361335, ups=0.74, wpb=489490, bsz=16551.4, num_updates=35300, lr=0.000336622, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.4, wall=47984 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1371 / 1707 loss=3.589, nll_loss=2.04, ppl=4.11, wps=367537, ups=0.75, wpb=489654, bsz=16205, num_updates=35400, lr=0.000336146, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=48118 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1472 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=363528, ups=0.74, wpb=490042, bsz=16556.7, num_updates=35500, lr=0.000335673, gnorm=0.259, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.3, wall=48252 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1572 / 1707 loss=3.586, nll_loss=2.037, ppl=4.1, wps=364728, ups=0.74, wpb=490115, bsz=16385.5, num_updates=35600, lr=0.000335201, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=48387 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 epoch 021: 1672 / 1707 loss=3.585, nll_loss=2.037, ppl=4.1, wps=367741, ups=0.75, wpb=490422, bsz=16204, num_updates=35700, lr=0.000334731, gnorm=0.255, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=48520 end of epoch 21 (average epoch stats below) epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 epoch 021 | loss 3.579 | nll_loss 2.03 | ppl 4.08 | wps 362118 | ups 0.74 | wpb 489892 | bsz 16337.5 | num_updates 35735 | lr 0.000334567 | gnorm 0.255 | clip 0.1 | loss_scale 0.5 | train_wall 2273 | gb_free 62 | wall 48566 Start iterating over samples epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 65 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=364343, ups=0.75, wpb=485873, bsz=16300.6, num_updates=35800, lr=0.000334263, gnorm=0.24, clip=0, loss_scale=1, train_wall=133, gb_free=61.2, wall=48654 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 165 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367821, ups=0.75, wpb=490201, bsz=16206, num_updates=35900, lr=0.000333797, gnorm=0.259, clip=0, loss_scale=1, train_wall=133, gb_free=61.3, wall=48787 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 epoch 022: 266 / 1707 loss=3.56, nll_loss=2.007, ppl=4.02, wps=361576, ups=0.74, wpb=489937, bsz=16332.7, num_updates=36000, lr=0.000333333, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=135, gb_free=60.5, wall=48922 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022 | valid on 'valid' subset | loss 3.777 | nll_loss 2.232 | ppl 4.7 | wps 214301 | wpb 22263 | bsz 1004 | num_updates 36000 | best_loss 3.763 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 366 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=335337, ups=0.69, wpb=488686, bsz=16286, num_updates=36100, lr=0.000332871, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.8, wall=49068 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 466 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=368542, ups=0.75, wpb=490655, bsz=16197.4, num_updates=36200, lr=0.000332411, gnorm=0.258, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=49201 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 566 / 1707 loss=3.57, nll_loss=2.019, ppl=4.05, wps=366542, ups=0.75, wpb=490424, bsz=16090.5, num_updates=36300, lr=0.000331953, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49335 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=367644, ups=0.75, wpb=490697, bsz=16363.9, num_updates=36400, lr=0.000331497, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=49468 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 766 / 1707 loss=3.574, nll_loss=2.023, ppl=4.07, wps=365842, ups=0.75, wpb=489579, bsz=16336.2, num_updates=36500, lr=0.000331042, gnorm=0.247, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=49602 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 866 / 1707 loss=3.568, nll_loss=2.017, ppl=4.05, wps=366652, ups=0.75, wpb=491157, bsz=16520.8, num_updates=36600, lr=0.00033059, gnorm=0.246, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=49736 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=366183, ups=0.75, wpb=490001, bsz=16272, num_updates=36700, lr=0.000330139, gnorm=0.242, clip=0, loss_scale=2, train_wall=133, gb_free=61, wall=49870 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1067 / 1707 loss=3.578, nll_loss=2.029, ppl=4.08, wps=360146, ups=0.74, wpb=489251, bsz=16361.8, num_updates=36800, lr=0.00032969, gnorm=0.251, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=50006 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1169 / 1707 loss=3.576, nll_loss=2.026, ppl=4.07, wps=360345, ups=0.73, wpb=491690, bsz=16468.4, num_updates=36900, lr=0.000329243, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.5, wall=50142 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 epoch 022: 1270 / 1707 loss=3.58, nll_loss=2.03, ppl=4.09, wps=362511, ups=0.74, wpb=491165, bsz=16581.3, num_updates=37000, lr=0.000328798, gnorm=0.28, clip=0, loss_scale=0.25, train_wall=135, gb_free=60.5, wall=50278 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022 | valid on 'valid' subset | loss 3.758 | nll_loss 2.212 | ppl 4.63 | wps 97997.6 | wpb 22263 | bsz 1004 | num_updates 37000 | best_loss 3.758 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1370 / 1707 loss=3.586, nll_loss=2.038, ppl=4.11, wps=324680, ups=0.66, wpb=489120, bsz=16362.8, num_updates=37100, lr=0.000328355, gnorm=0.264, clip=0, loss_scale=0.25, train_wall=132, gb_free=60.3, wall=50428 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1470 / 1707 loss=3.583, nll_loss=2.034, ppl=4.1, wps=367308, ups=0.75, wpb=490328, bsz=16514.3, num_updates=37200, lr=0.000327913, gnorm=0.269, clip=0, loss_scale=0.25, train_wall=133, gb_free=60.4, wall=50562 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1570 / 1707 loss=3.585, nll_loss=2.036, ppl=4.1, wps=369205, ups=0.75, wpb=490176, bsz=16222.7, num_updates=37300, lr=0.000327473, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=132, gb_free=60.3, wall=50695 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 epoch 022: 1670 / 1707 loss=3.587, nll_loss=2.039, ppl=4.11, wps=367044, ups=0.75, wpb=489427, bsz=16136.3, num_updates=37400, lr=0.000327035, gnorm=0.256, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.6, wall=50828 end of epoch 22 (average epoch stats below) epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 epoch 022 | loss 3.573 | nll_loss 2.023 | ppl 4.06 | wps 360862 | ups 0.74 | wpb 489895 | bsz 16331.1 | num_updates 37437 | lr 0.000326873 | gnorm 0.256 | clip 0 | loss_scale 0.5 | train_wall 2272 | gb_free 61.1 | wall 50877 Start iterating over samples epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 64 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=360935, ups=0.74, wpb=486305, bsz=16183.3, num_updates=37500, lr=0.000326599, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.7, wall=50963 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 164 / 1707 loss=3.552, nll_loss=1.999, ppl=4, wps=366076, ups=0.75, wpb=489931, bsz=16352.8, num_updates=37600, lr=0.000326164, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.3, wall=51097 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 264 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=366385, ups=0.75, wpb=491024, bsz=16833, num_updates=37700, lr=0.000325731, gnorm=0.264, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51231 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 365 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=363946, ups=0.74, wpb=490684, bsz=16196.9, num_updates=37800, lr=0.0003253, gnorm=0.279, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.4, wall=51366 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 466 / 1707 loss=3.565, nll_loss=2.013, ppl=4.04, wps=362913, ups=0.74, wpb=491094, bsz=16288.4, num_updates=37900, lr=0.000324871, gnorm=0.264, clip=1, loss_scale=0.25, train_wall=135, gb_free=60.4, wall=51501 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 epoch 023: 566 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=367342, ups=0.75, wpb=489718, bsz=16542.8, num_updates=38000, lr=0.000324443, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.5, wall=51634 begin validation on "valid" subset epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.776 | nll_loss 2.226 | ppl 4.68 | wps 215405 | wpb 22263 | bsz 1004 | num_updates 38000 | best_loss 3.758 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 666 / 1707 loss=3.569, nll_loss=2.018, ppl=4.05, wps=335124, ups=0.68, wpb=489972, bsz=16426.6, num_updates=38100, lr=0.000324017, gnorm=0.308, clip=1, loss_scale=0.25, train_wall=133, gb_free=60.6, wall=51780 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 766 / 1707 loss=3.571, nll_loss=2.02, ppl=4.06, wps=364782, ups=0.74, wpb=489735, bsz=16454.5, num_updates=38200, lr=0.000323592, gnorm=0.3, clip=1, loss_scale=0.5, train_wall=134, gb_free=60.6, wall=51915 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 866 / 1707 loss=3.573, nll_loss=2.022, ppl=4.06, wps=365792, ups=0.75, wpb=490496, bsz=16813.2, num_updates=38300, lr=0.00032317, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52049 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 966 / 1707 loss=3.575, nll_loss=2.025, ppl=4.07, wps=365839, ups=0.75, wpb=490656, bsz=16163, num_updates=38400, lr=0.000322749, gnorm=0.261, clip=0, loss_scale=1, train_wall=134, gb_free=60.3, wall=52183 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1066 / 1707 loss=3.574, nll_loss=2.025, ppl=4.07, wps=367174, ups=0.75, wpb=490488, bsz=16480.1, num_updates=38500, lr=0.000322329, gnorm=0.246, clip=0, loss_scale=1, train_wall=133, gb_free=60.3, wall=52316 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1166 / 1707 loss=3.572, nll_loss=2.021, ppl=4.06, wps=365870, ups=0.75, wpb=490401, bsz=16280.9, num_updates=38600, lr=0.000321911, gnorm=0.252, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=52450 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1266 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=364532, ups=0.74, wpb=489556, bsz=16229.2, num_updates=38700, lr=0.000321495, gnorm=0.269, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=52585 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1368 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=358232, ups=0.73, wpb=488321, bsz=16052.3, num_updates=38800, lr=0.000321081, gnorm=0.258, clip=0, loss_scale=0.5, train_wall=136, gb_free=60.4, wall=52721 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1468 / 1707 loss=3.573, nll_loss=2.023, ppl=4.06, wps=367038, ups=0.75, wpb=489351, bsz=16272.9, num_updates=38900, lr=0.000320668, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=52854 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 epoch 023: 1568 / 1707 loss=3.577, nll_loss=2.027, ppl=4.08, wps=368779, ups=0.75, wpb=490358, bsz=15935.9, num_updates=39000, lr=0.000320256, gnorm=0.303, clip=1, loss_scale=1, train_wall=132, gb_free=60.4, wall=52987 begin validation on "valid" subset epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023 | valid on 'valid' subset | loss 3.768 | nll_loss 2.223 | ppl 4.67 | wps 214868 | wpb 22263 | bsz 1004 | num_updates 39000 | best_loss 3.758 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 epoch 023: 1668 / 1707 loss=3.579, nll_loss=2.03, ppl=4.08, wps=335553, ups=0.68, wpb=490281, bsz=16148.1, num_updates=39100, lr=0.000319847, gnorm=0.267, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=53133 end of epoch 23 (average epoch stats below) epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 epoch 023 | loss 3.569 | nll_loss 2.018 | ppl 4.05 | wps 361303 | ups 0.74 | wpb 489897 | bsz 16330.2 | num_updates 39139 | lr 0.000319687 | gnorm 0.271 | clip 0.3 | loss_scale 1 | train_wall 2273 | gb_free 61.5 | wall 53185 Start iterating over samples epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 61 / 1707 loss=3.557, nll_loss=2.005, ppl=4.01, wps=365089, ups=0.75, wpb=485597, bsz=16073.4, num_updates=39200, lr=0.000319438, gnorm=0.271, clip=0, loss_scale=1, train_wall=132, gb_free=60.4, wall=53266 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 161 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=365821, ups=0.75, wpb=489338, bsz=16534.6, num_updates=39300, lr=0.000319032, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=53400 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 261 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365764, ups=0.75, wpb=490597, bsz=16566.6, num_updates=39400, lr=0.000318626, gnorm=0.251, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=53534 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 362 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=362599, ups=0.74, wpb=490357, bsz=16316.6, num_updates=39500, lr=0.000318223, gnorm=0.266, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=53670 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 463 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=362869, ups=0.74, wpb=489441, bsz=16369.7, num_updates=39600, lr=0.000317821, gnorm=0.268, clip=0, loss_scale=0.5, train_wall=134, gb_free=60.5, wall=53805 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 563 / 1707 loss=3.557, nll_loss=2.004, ppl=4.01, wps=366012, ups=0.75, wpb=489745, bsz=16302.6, num_updates=39700, lr=0.00031742, gnorm=0.271, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.4, wall=53938 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 663 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=367438, ups=0.75, wpb=491156, bsz=16337.6, num_updates=39800, lr=0.000317021, gnorm=0.249, clip=0, loss_scale=0.5, train_wall=133, gb_free=60.5, wall=54072 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 763 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=367929, ups=0.75, wpb=490282, bsz=16273.7, num_updates=39900, lr=0.000316624, gnorm=0.266, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54205 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 epoch 024: 863 / 1707 loss=3.563, nll_loss=2.011, ppl=4.03, wps=367956, ups=0.75, wpb=490168, bsz=16385.8, num_updates=40000, lr=0.000316228, gnorm=0.251, clip=0, loss_scale=1, train_wall=133, gb_free=60.6, wall=54338 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024 | valid on 'valid' subset | loss 3.761 | nll_loss 2.213 | ppl 4.63 | wps 221296 | wpb 22263 | bsz 1004 | num_updates 40000 | best_loss 3.758 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 963 / 1707 loss=3.565, nll_loss=2.014, ppl=4.04, wps=335880, ups=0.69, wpb=489942, bsz=16515.8, num_updates=40100, lr=0.000315833, gnorm=0.264, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=54484 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1064 / 1707 loss=3.564, nll_loss=2.012, ppl=4.03, wps=364096, ups=0.74, wpb=490405, bsz=16347.3, num_updates=40200, lr=0.00031544, gnorm=0.238, clip=0, loss_scale=1, train_wall=134, gb_free=60.5, wall=54619 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1164 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=366724, ups=0.75, wpb=489960, bsz=16259.6, num_updates=40300, lr=0.000315049, gnorm=0.249, clip=0, loss_scale=1, train_wall=133, gb_free=60.8, wall=54753 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1264 / 1707 loss=3.567, nll_loss=2.017, ppl=4.05, wps=367331, ups=0.75, wpb=490574, bsz=16069.8, num_updates=40400, lr=0.000314658, gnorm=0.247, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=54886 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1364 / 1707 loss=3.564, nll_loss=2.013, ppl=4.04, wps=367118, ups=0.75, wpb=490869, bsz=16449.4, num_updates=40500, lr=0.00031427, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=55020 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1464 / 1707 loss=3.569, nll_loss=2.019, ppl=4.05, wps=365825, ups=0.75, wpb=488990, bsz=16547.7, num_updates=40600, lr=0.000313882, gnorm=0.243, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=55154 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1564 / 1707 loss=3.568, nll_loss=2.018, ppl=4.05, wps=365621, ups=0.75, wpb=490539, bsz=16290.6, num_updates=40700, lr=0.000313497, gnorm=0.241, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=55288 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 epoch 024: 1665 / 1707 loss=3.57, nll_loss=2.02, ppl=4.06, wps=362829, ups=0.74, wpb=490277, bsz=16098.3, num_updates=40800, lr=0.000313112, gnorm=0.236, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=55423 end of epoch 24 (average epoch stats below) epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 epoch 024 | loss 3.561 | nll_loss 2.01 | ppl 4.03 | wps 363768 | ups 0.74 | wpb 489904 | bsz 16332.4 | num_updates 40842 | lr 0.000312951 | gnorm 0.252 | clip 0 | loss_scale 2 | train_wall 2272 | gb_free 61.3 | wall 55478 Start iterating over samples epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 58 / 1707 loss=3.549, nll_loss=1.996, ppl=3.99, wps=364535, ups=0.75, wpb=486012, bsz=16281, num_updates=40900, lr=0.000312729, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=55556 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 epoch 025: 158 / 1707 loss=3.538, nll_loss=1.983, ppl=3.95, wps=367009, ups=0.75, wpb=490817, bsz=16260.3, num_updates=41000, lr=0.000312348, gnorm=0.238, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=55690 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.765 | nll_loss 2.219 | ppl 4.66 | wps 216726 | wpb 22263 | bsz 1004 | num_updates 41000 | best_loss 3.758 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 258 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=334263, ups=0.68, wpb=489092, bsz=16410.5, num_updates=41100, lr=0.000311967, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=55836 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 358 / 1707 loss=3.547, nll_loss=1.993, ppl=3.98, wps=367250, ups=0.75, wpb=489899, bsz=16374.5, num_updates=41200, lr=0.000311588, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=55970 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 459 / 1707 loss=3.544, nll_loss=1.99, ppl=3.97, wps=363623, ups=0.74, wpb=490311, bsz=16293.6, num_updates=41300, lr=0.000311211, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=56104 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 559 / 1707 loss=3.549, nll_loss=1.995, ppl=3.99, wps=367087, ups=0.75, wpb=489591, bsz=16210.7, num_updates=41400, lr=0.000310835, gnorm=0.25, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=56238 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 659 / 1707 loss=3.551, nll_loss=1.998, ppl=3.99, wps=365950, ups=0.75, wpb=489810, bsz=16250, num_updates=41500, lr=0.00031046, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=56372 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 759 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=368617, ups=0.75, wpb=490268, bsz=16301, num_updates=41600, lr=0.000310087, gnorm=0.24, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=56505 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 859 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=365067, ups=0.75, wpb=489281, bsz=16166.2, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=61.2, wall=56639 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 960 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=362355, ups=0.74, wpb=490558, bsz=16424.4, num_updates=41800, lr=0.000309344, gnorm=0.23, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=56774 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1061 / 1707 loss=3.558, nll_loss=2.006, ppl=4.02, wps=363158, ups=0.74, wpb=490175, bsz=16495.8, num_updates=41900, lr=0.000308975, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=56909 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 epoch 025: 1161 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=367336, ups=0.75, wpb=490120, bsz=16330.2, num_updates=42000, lr=0.000308607, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=57043 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 216296 | wpb 22263 | bsz 1004 | num_updates 42000 | best_loss 3.758 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1261 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=323718, ups=0.66, wpb=490728, bsz=16295.9, num_updates=42100, lr=0.00030824, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=57194 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1361 / 1707 loss=3.56, nll_loss=2.008, ppl=4.02, wps=365733, ups=0.75, wpb=489752, bsz=16175.6, num_updates=42200, lr=0.000307875, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=57328 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1461 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=368133, ups=0.75, wpb=490624, bsz=16506.7, num_updates=42300, lr=0.00030751, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=57461 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1562 / 1707 loss=3.559, nll_loss=2.007, ppl=4.02, wps=363621, ups=0.74, wpb=490666, bsz=16234.6, num_updates=42400, lr=0.000307148, gnorm=0.238, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=57596 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 epoch 025: 1662 / 1707 loss=3.559, nll_loss=2.008, ppl=4.02, wps=367488, ups=0.75, wpb=491343, bsz=16586.2, num_updates=42500, lr=0.000306786, gnorm=0.242, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=57730 end of epoch 25 (average epoch stats below) epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 epoch 025 | loss 3.552 | nll_loss 1.999 | ppl 4 | wps 361046 | ups 0.74 | wpb 489907 | bsz 16331.4 | num_updates 42545 | lr 0.000306624 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 60.9 | wall 57789 Start iterating over samples epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 55 / 1707 loss=3.543, nll_loss=1.989, ppl=3.97, wps=365727, ups=0.75, wpb=485855, bsz=16247.4, num_updates=42600, lr=0.000306426, gnorm=0.246, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=57863 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 156 / 1707 loss=3.53, nll_loss=1.974, ppl=3.93, wps=363799, ups=0.74, wpb=489812, bsz=16253.3, num_updates=42700, lr=0.000306067, gnorm=0.239, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=57997 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 256 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=367332, ups=0.75, wpb=491120, bsz=16450.5, num_updates=42800, lr=0.000305709, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58131 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 356 / 1707 loss=3.534, nll_loss=1.979, ppl=3.94, wps=367920, ups=0.75, wpb=491134, bsz=16560.5, num_updates=42900, lr=0.000305352, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=58265 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 epoch 026: 457 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=363897, ups=0.74, wpb=490035, bsz=15956, num_updates=43000, lr=0.000304997, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.9, wall=58399 begin validation on "valid" subset epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026 | valid on 'valid' subset | loss 3.77 | nll_loss 2.221 | ppl 4.66 | wps 216916 | wpb 22263 | bsz 1004 | num_updates 43000 | best_loss 3.758 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 557 / 1707 loss=3.54, nll_loss=1.986, ppl=3.96, wps=335960, ups=0.69, wpb=489595, bsz=16348.6, num_updates=43100, lr=0.000304643, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=58545 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 658 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=363757, ups=0.74, wpb=489645, bsz=16519.8, num_updates=43200, lr=0.00030429, gnorm=0.242, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=58680 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 758 / 1707 loss=3.541, nll_loss=1.987, ppl=3.96, wps=367112, ups=0.75, wpb=489027, bsz=16273.6, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=58813 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 858 / 1707 loss=3.544, nll_loss=1.991, ppl=3.97, wps=366932, ups=0.75, wpb=490212, bsz=16171.8, num_updates=43400, lr=0.000303588, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=58946 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 959 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=364553, ups=0.74, wpb=490621, bsz=16520.6, num_updates=43500, lr=0.000303239, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59081 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1059 / 1707 loss=3.548, nll_loss=1.995, ppl=3.99, wps=364520, ups=0.74, wpb=490733, bsz=16457.8, num_updates=43600, lr=0.000302891, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=59216 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1159 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=365173, ups=0.75, wpb=489282, bsz=16320.8, num_updates=43700, lr=0.000302545, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59350 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1260 / 1707 loss=3.55, nll_loss=1.997, ppl=3.99, wps=363423, ups=0.74, wpb=490979, bsz=16288.2, num_updates=43800, lr=0.000302199, gnorm=0.237, clip=0, loss_scale=2, train_wall=135, gb_free=60.9, wall=59485 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1360 / 1707 loss=3.554, nll_loss=2.002, ppl=4, wps=363373, ups=0.74, wpb=489214, bsz=16452.8, num_updates=43900, lr=0.000301855, gnorm=0.233, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=59619 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 epoch 026: 1460 / 1707 loss=3.553, nll_loss=2.002, ppl=4, wps=367310, ups=0.75, wpb=490044, bsz=16416.6, num_updates=44000, lr=0.000301511, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=59753 begin validation on "valid" subset epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026 | valid on 'valid' subset | loss 3.747 | nll_loss 2.2 | ppl 4.59 | wps 207618 | wpb 22263 | bsz 1004 | num_updates 44000 | best_loss 3.747 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1560 / 1707 loss=3.555, nll_loss=2.003, ppl=4.01, wps=314379, ups=0.64, wpb=490020, bsz=16226.2, num_updates=44100, lr=0.000301169, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=59909 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 epoch 026: 1660 / 1707 loss=3.556, nll_loss=2.004, ppl=4.01, wps=367074, ups=0.75, wpb=489593, bsz=16243.6, num_updates=44200, lr=0.000300828, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=60042 end of epoch 26 (average epoch stats below) epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 epoch 026 | loss 3.544 | nll_loss 1.991 | ppl 3.97 | wps 359930 | ups 0.73 | wpb 489893 | bsz 16333.6 | num_updates 44246 | lr 0.000300672 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2271 | gb_free 62.7 | wall 60104 Start iterating over samples epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 54 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=362999, ups=0.74, wpb=487421, bsz=16028.9, num_updates=44300, lr=0.000300489, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=60176 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 154 / 1707 loss=3.523, nll_loss=1.966, ppl=3.91, wps=366163, ups=0.75, wpb=490566, bsz=16198.9, num_updates=44400, lr=0.00030015, gnorm=0.241, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60310 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 255 / 1707 loss=3.527, nll_loss=1.971, ppl=3.92, wps=363382, ups=0.74, wpb=489422, bsz=16191.7, num_updates=44500, lr=0.000299813, gnorm=0.234, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=60445 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 355 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=367104, ups=0.75, wpb=489975, bsz=16417.4, num_updates=44600, lr=0.000299476, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=60578 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 455 / 1707 loss=3.528, nll_loss=1.972, ppl=3.92, wps=364969, ups=0.74, wpb=490153, bsz=16234.5, num_updates=44700, lr=0.000299141, gnorm=0.232, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60713 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 556 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=364453, ups=0.74, wpb=490300, bsz=16411.4, num_updates=44800, lr=0.000298807, gnorm=0.236, clip=0, loss_scale=4, train_wall=134, gb_free=60.6, wall=60847 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 656 / 1707 loss=3.537, nll_loss=1.982, ppl=3.95, wps=367330, ups=0.75, wpb=489965, bsz=16212.7, num_updates=44900, lr=0.000298474, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=60981 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 epoch 027: 756 / 1707 loss=3.539, nll_loss=1.986, ppl=3.96, wps=366407, ups=0.75, wpb=489276, bsz=16582.9, num_updates=45000, lr=0.000298142, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=61114 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027 | valid on 'valid' subset | loss 3.749 | nll_loss 2.202 | ppl 4.6 | wps 215512 | wpb 22263 | bsz 1004 | num_updates 45000 | best_loss 3.747 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 857 / 1707 loss=3.542, nll_loss=1.988, ppl=3.97, wps=332472, ups=0.68, wpb=490143, bsz=16480, num_updates=45100, lr=0.000297812, gnorm=0.24, clip=0, loss_scale=4, train_wall=135, gb_free=60.3, wall=61262 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 958 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=363755, ups=0.74, wpb=490638, bsz=16257.7, num_updates=45200, lr=0.000297482, gnorm=0.238, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=61397 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1058 / 1707 loss=3.541, nll_loss=1.987, ppl=3.97, wps=365851, ups=0.75, wpb=489196, bsz=16322.4, num_updates=45300, lr=0.000297154, gnorm=0.236, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=61530 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1158 / 1707 loss=3.54, nll_loss=1.987, ppl=3.96, wps=365911, ups=0.75, wpb=490966, bsz=16364.2, num_updates=45400, lr=0.000296826, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=61664 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1258 / 1707 loss=3.543, nll_loss=1.99, ppl=3.97, wps=365932, ups=0.75, wpb=489701, bsz=16334.5, num_updates=45500, lr=0.0002965, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=61798 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1358 / 1707 loss=3.547, nll_loss=1.994, ppl=3.98, wps=366529, ups=0.75, wpb=491554, bsz=16441.2, num_updates=45600, lr=0.000296174, gnorm=0.24, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=61932 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1459 / 1707 loss=3.546, nll_loss=1.993, ppl=3.98, wps=363602, ups=0.74, wpb=491053, bsz=16405, num_updates=45700, lr=0.00029585, gnorm=0.232, clip=0, loss_scale=4, train_wall=135, gb_free=59.6, wall=62067 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1559 / 1707 loss=3.545, nll_loss=1.992, ppl=3.98, wps=366267, ups=0.75, wpb=490326, bsz=16176.9, num_updates=45800, lr=0.000295527, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62201 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 epoch 027: 1660 / 1707 loss=3.549, nll_loss=1.997, ppl=3.99, wps=361987, ups=0.74, wpb=489054, bsz=16503.1, num_updates=45900, lr=0.000295205, gnorm=0.233, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=62336 end of epoch 27 (average epoch stats below) epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 epoch 027 | loss 3.538 | nll_loss 1.984 | ppl 3.95 | wps 363181 | ups 0.74 | wpb 489909 | bsz 16333.3 | num_updates 45947 | lr 0.000295054 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2274 | gb_free 61.8 | wall 62399 Start iterating over samples epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 epoch 028: 53 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=363980, ups=0.75, wpb=485751, bsz=16285, num_updates=46000, lr=0.000294884, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=62470 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.754 | nll_loss 2.209 | ppl 4.62 | wps 209525 | wpb 22263 | bsz 1004 | num_updates 46000 | best_loss 3.747 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 153 / 1707 loss=3.518, nll_loss=1.961, ppl=3.89, wps=334549, ups=0.68, wpb=490336, bsz=16539, num_updates=46100, lr=0.000294564, gnorm=0.218, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=62616 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 254 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=363804, ups=0.74, wpb=489947, bsz=16189.8, num_updates=46200, lr=0.000294245, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=62751 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 354 / 1707 loss=3.521, nll_loss=1.965, ppl=3.9, wps=365168, ups=0.74, wpb=490277, bsz=16463.5, num_updates=46300, lr=0.000293927, gnorm=0.244, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=62885 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 454 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=366368, ups=0.75, wpb=490911, bsz=16314.1, num_updates=46400, lr=0.00029361, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=63019 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 554 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=366143, ups=0.75, wpb=489009, bsz=16354.4, num_updates=46500, lr=0.000293294, gnorm=0.236, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=63153 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 655 / 1707 loss=3.539, nll_loss=1.985, ppl=3.96, wps=361802, ups=0.74, wpb=489497, bsz=16564.6, num_updates=46600, lr=0.000292979, gnorm=0.226, clip=0, loss_scale=2, train_wall=135, gb_free=60.5, wall=63288 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 756 / 1707 loss=3.53, nll_loss=1.975, ppl=3.93, wps=362906, ups=0.74, wpb=489586, bsz=16453.2, num_updates=46700, lr=0.000292666, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.6, wall=63423 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 856 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=366894, ups=0.75, wpb=490290, bsz=16100.6, num_updates=46800, lr=0.000292353, gnorm=0.236, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=63557 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 956 / 1707 loss=3.531, nll_loss=1.977, ppl=3.94, wps=366918, ups=0.75, wpb=489771, bsz=16350.9, num_updates=46900, lr=0.000292041, gnorm=0.238, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=63690 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 epoch 028: 1056 / 1707 loss=3.536, nll_loss=1.982, ppl=3.95, wps=366664, ups=0.75, wpb=491144, bsz=16465.3, num_updates=47000, lr=0.00029173, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=63824 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028 | valid on 'valid' subset | loss 3.752 | nll_loss 2.204 | ppl 4.61 | wps 214155 | wpb 22263 | bsz 1004 | num_updates 47000 | best_loss 3.747 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1156 / 1707 loss=3.538, nll_loss=1.984, ppl=3.96, wps=321701, ups=0.66, wpb=489397, bsz=16354.1, num_updates=47100, lr=0.00029142, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=63976 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1256 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=368153, ups=0.75, wpb=490243, bsz=16345, num_updates=47200, lr=0.000291111, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=64109 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1356 / 1707 loss=3.536, nll_loss=1.981, ppl=3.95, wps=368222, ups=0.75, wpb=490342, bsz=16170, num_updates=47300, lr=0.000290803, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=64243 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1456 / 1707 loss=3.537, nll_loss=1.984, ppl=3.95, wps=366262, ups=0.75, wpb=489879, bsz=16205, num_updates=47400, lr=0.000290496, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=64376 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1557 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=364386, ups=0.74, wpb=490966, bsz=16326.4, num_updates=47500, lr=0.000290191, gnorm=0.227, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=64511 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 epoch 028: 1657 / 1707 loss=3.538, nll_loss=1.985, ppl=3.96, wps=365158, ups=0.75, wpb=489899, bsz=16265, num_updates=47600, lr=0.000289886, gnorm=0.235, clip=0, loss_scale=4, train_wall=134, gb_free=60.5, wall=64645 end of epoch 28 (average epoch stats below) epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 epoch 028 | loss 3.532 | nll_loss 1.977 | ppl 3.94 | wps 360566 | ups 0.74 | wpb 489900 | bsz 16333 | num_updates 47649 | lr 0.000289736 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 62.1 | wall 64711 Start iterating over samples epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 51 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=362596, ups=0.75, wpb=486490, bsz=16111.1, num_updates=47700, lr=0.000289581, gnorm=0.237, clip=0, loss_scale=2, train_wall=133, gb_free=61.5, wall=64779 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 151 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=366475, ups=0.75, wpb=489776, bsz=16458, num_updates=47800, lr=0.000289278, gnorm=0.241, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=64913 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 251 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=368396, ups=0.75, wpb=491040, bsz=16307.8, num_updates=47900, lr=0.000288976, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=65046 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 epoch 029: 352 / 1707 loss=3.516, nll_loss=1.958, ppl=3.89, wps=362701, ups=0.74, wpb=490360, bsz=16315.4, num_updates=48000, lr=0.000288675, gnorm=0.225, clip=0, loss_scale=2, train_wall=135, gb_free=61.5, wall=65182 begin validation on "valid" subset epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.749 | nll_loss 2.2 | ppl 4.59 | wps 212428 | wpb 22263 | bsz 1004 | num_updates 48000 | best_loss 3.747 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 452 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=333682, ups=0.68, wpb=489119, bsz=16545.4, num_updates=48100, lr=0.000288375, gnorm=0.223, clip=0, loss_scale=2, train_wall=132, gb_free=60.4, wall=65328 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 552 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=368493, ups=0.75, wpb=490454, bsz=16295, num_updates=48200, lr=0.000288076, gnorm=0.238, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=65461 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 652 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=367989, ups=0.75, wpb=490842, bsz=16298.6, num_updates=48300, lr=0.000287777, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.8, wall=65595 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 752 / 1707 loss=3.525, nll_loss=1.969, ppl=3.91, wps=366185, ups=0.75, wpb=490185, bsz=16382, num_updates=48400, lr=0.00028748, gnorm=0.231, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=65729 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 853 / 1707 loss=3.526, nll_loss=1.971, ppl=3.92, wps=362620, ups=0.74, wpb=489933, bsz=16341.4, num_updates=48500, lr=0.000287183, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.4, wall=65864 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 953 / 1707 loss=3.531, nll_loss=1.976, ppl=3.94, wps=367219, ups=0.75, wpb=490564, bsz=16424.2, num_updates=48600, lr=0.000286888, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=65997 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1053 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=365892, ups=0.75, wpb=489819, bsz=16483.2, num_updates=48700, lr=0.000286593, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66131 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1155 / 1707 loss=3.532, nll_loss=1.977, ppl=3.94, wps=359669, ups=0.73, wpb=489643, bsz=16197, num_updates=48800, lr=0.000286299, gnorm=0.227, clip=0, loss_scale=2, train_wall=136, gb_free=60.4, wall=66267 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1255 / 1707 loss=3.537, nll_loss=1.983, ppl=3.95, wps=364836, ups=0.74, wpb=489832, bsz=16463.7, num_updates=48900, lr=0.000286006, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=66402 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 epoch 029: 1355 / 1707 loss=3.532, nll_loss=1.978, ppl=3.94, wps=366059, ups=0.75, wpb=490496, bsz=16385.9, num_updates=49000, lr=0.000285714, gnorm=0.222, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=66536 begin validation on "valid" subset epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029 | valid on 'valid' subset | loss 3.765 | nll_loss 2.22 | ppl 4.66 | wps 207286 | wpb 22263 | bsz 1004 | num_updates 49000 | best_loss 3.747 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1455 / 1707 loss=3.534, nll_loss=1.98, ppl=3.94, wps=331743, ups=0.68, wpb=489195, bsz=16341.4, num_updates=49100, lr=0.000285423, gnorm=0.228, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66683 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1555 / 1707 loss=3.533, nll_loss=1.979, ppl=3.94, wps=367079, ups=0.75, wpb=491344, bsz=16150.4, num_updates=49200, lr=0.000285133, gnorm=0.239, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=66817 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 epoch 029: 1656 / 1707 loss=3.535, nll_loss=1.981, ppl=3.95, wps=363979, ups=0.74, wpb=490103, bsz=16330.1, num_updates=49300, lr=0.000284844, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=66952 end of epoch 29 (average epoch stats below) epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 epoch 029 | loss 3.526 | nll_loss 1.971 | ppl 3.92 | wps 361352 | ups 0.74 | wpb 489899 | bsz 16334.3 | num_updates 49351 | lr 0.000284696 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2271 | gb_free 60.8 | wall 67019 Start iterating over samples epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 49 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=366134, ups=0.75, wpb=485296, bsz=15923.3, num_updates=49400, lr=0.000284555, gnorm=0.237, clip=0, loss_scale=2, train_wall=132, gb_free=60.8, wall=67084 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 149 / 1707 loss=3.51, nll_loss=1.952, ppl=3.87, wps=365858, ups=0.75, wpb=489354, bsz=16367.2, num_updates=49500, lr=0.000284268, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=67218 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 249 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=368667, ups=0.75, wpb=490995, bsz=16407.1, num_updates=49600, lr=0.000283981, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=67351 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 349 / 1707 loss=3.512, nll_loss=1.954, ppl=3.87, wps=367851, ups=0.75, wpb=491103, bsz=16184.4, num_updates=49700, lr=0.000283695, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.2, wall=67485 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 451 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=358946, ups=0.73, wpb=490228, bsz=16419.9, num_updates=49800, lr=0.00028341, gnorm=0.236, clip=0, loss_scale=2, train_wall=136, gb_free=61, wall=67621 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 551 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366160, ups=0.75, wpb=489802, bsz=16322.2, num_updates=49900, lr=0.000283126, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=67755 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 epoch 030: 651 / 1707 loss=3.521, nll_loss=1.964, ppl=3.9, wps=366122, ups=0.75, wpb=490326, bsz=16560.9, num_updates=50000, lr=0.000282843, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=67889 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.748 | nll_loss 2.199 | ppl 4.59 | wps 210206 | wpb 22263 | bsz 1004 | num_updates 50000 | best_loss 3.747 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 751 / 1707 loss=3.518, nll_loss=1.962, ppl=3.89, wps=319049, ups=0.65, wpb=490462, bsz=16384.2, num_updates=50100, lr=0.00028256, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=61.3, wall=68043 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 852 / 1707 loss=3.525, nll_loss=1.97, ppl=3.92, wps=362901, ups=0.74, wpb=489756, bsz=16165.8, num_updates=50200, lr=0.000282279, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.6, wall=68177 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 952 / 1707 loss=3.531, nll_loss=1.976, ppl=3.93, wps=365746, ups=0.75, wpb=489283, bsz=16338.4, num_updates=50300, lr=0.000281998, gnorm=0.245, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=68311 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1052 / 1707 loss=3.524, nll_loss=1.969, ppl=3.91, wps=365747, ups=0.75, wpb=489853, bsz=16455.8, num_updates=50400, lr=0.000281718, gnorm=0.224, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=68445 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1153 / 1707 loss=3.527, nll_loss=1.972, ppl=3.92, wps=363271, ups=0.74, wpb=490045, bsz=16117.8, num_updates=50500, lr=0.000281439, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=68580 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1253 / 1707 loss=3.523, nll_loss=1.967, ppl=3.91, wps=366096, ups=0.75, wpb=489643, bsz=16282.6, num_updates=50600, lr=0.000281161, gnorm=0.231, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=68714 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1354 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=362592, ups=0.74, wpb=490605, bsz=16486.6, num_updates=50700, lr=0.000280883, gnorm=0.223, clip=0, loss_scale=1, train_wall=135, gb_free=60.8, wall=68849 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1454 / 1707 loss=3.53, nll_loss=1.976, ppl=3.93, wps=365828, ups=0.75, wpb=490187, bsz=16305.8, num_updates=50800, lr=0.000280607, gnorm=0.232, clip=0, loss_scale=1, train_wall=133, gb_free=60.5, wall=68983 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1554 / 1707 loss=3.528, nll_loss=1.973, ppl=3.93, wps=366966, ups=0.75, wpb=490501, bsz=16441.5, num_updates=50900, lr=0.000280331, gnorm=0.235, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=69117 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 epoch 030: 1654 / 1707 loss=3.533, nll_loss=1.978, ppl=3.94, wps=365550, ups=0.75, wpb=490060, bsz=16287.2, num_updates=51000, lr=0.000280056, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=69251 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 epoch 030 | valid on 'valid' subset | loss 3.753 | nll_loss 2.205 | ppl 4.61 | wps 214580 | wpb 22263 | bsz 1004 | num_updates 51000 | best_loss 3.747 end of epoch 30 (average epoch stats below) epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 epoch 030 | loss 3.521 | nll_loss 1.965 | ppl 3.91 | wps 359348 | ups 0.73 | wpb 489882 | bsz 16330.2 | num_updates 51053 | lr 0.000279911 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.7 | wall 69339 Start iterating over samples epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 47 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=322981, ups=0.66, wpb=487121, bsz=16243.8, num_updates=51100, lr=0.000279782, gnorm=0.244, clip=0, loss_scale=2, train_wall=132, gb_free=60.9, wall=69402 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 148 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364137, ups=0.74, wpb=489761, bsz=16330, num_updates=51200, lr=0.000279508, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69536 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 248 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=365890, ups=0.75, wpb=490598, bsz=16234, num_updates=51300, lr=0.000279236, gnorm=0.239, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=69670 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 348 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=367744, ups=0.75, wpb=489583, bsz=16219.3, num_updates=51400, lr=0.000278964, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=69803 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 449 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=364343, ups=0.74, wpb=490534, bsz=16385, num_updates=51500, lr=0.000278693, gnorm=0.229, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=69938 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 549 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=367175, ups=0.75, wpb=490322, bsz=16446.8, num_updates=51600, lr=0.000278423, gnorm=0.239, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=70072 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 649 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367520, ups=0.75, wpb=489506, bsz=16615.8, num_updates=51700, lr=0.000278154, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=70205 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 749 / 1707 loss=3.517, nll_loss=1.96, ppl=3.89, wps=367779, ups=0.75, wpb=490535, bsz=16336.7, num_updates=51800, lr=0.000277885, gnorm=0.235, clip=0, loss_scale=4, train_wall=133, gb_free=61.4, wall=70338 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 850 / 1707 loss=3.515, nll_loss=1.958, ppl=3.89, wps=364616, ups=0.74, wpb=490757, bsz=16207.1, num_updates=51900, lr=0.000277617, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=70473 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 epoch 031: 950 / 1707 loss=3.519, nll_loss=1.963, ppl=3.9, wps=365010, ups=0.75, wpb=489126, bsz=16437.3, num_updates=52000, lr=0.00027735, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=70607 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031 | valid on 'valid' subset | loss 3.759 | nll_loss 2.213 | ppl 4.64 | wps 211549 | wpb 22263 | bsz 1004 | num_updates 52000 | best_loss 3.747 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1050 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=333917, ups=0.68, wpb=489295, bsz=16331.1, num_updates=52100, lr=0.000277084, gnorm=0.227, clip=0, loss_scale=4, train_wall=133, gb_free=60.1, wall=70753 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1151 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364700, ups=0.74, wpb=490654, bsz=16116.6, num_updates=52200, lr=0.000276818, gnorm=0.223, clip=0, loss_scale=2, train_wall=134, gb_free=60.8, wall=70888 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1252 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=363016, ups=0.74, wpb=490823, bsz=16259.2, num_updates=52300, lr=0.000276553, gnorm=0.227, clip=0, loss_scale=1, train_wall=135, gb_free=60.4, wall=71023 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1352 / 1707 loss=3.522, nll_loss=1.966, ppl=3.91, wps=365075, ups=0.75, wpb=489388, bsz=16316.7, num_updates=52400, lr=0.000276289, gnorm=0.232, clip=0, loss_scale=1, train_wall=134, gb_free=60.7, wall=71157 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1452 / 1707 loss=3.522, nll_loss=1.967, ppl=3.91, wps=367061, ups=0.75, wpb=490912, bsz=16465.3, num_updates=52500, lr=0.000276026, gnorm=0.221, clip=0, loss_scale=1, train_wall=133, gb_free=60.4, wall=71291 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1552 / 1707 loss=3.525, nll_loss=1.971, ppl=3.92, wps=365069, ups=0.75, wpb=489657, bsz=16310.1, num_updates=52600, lr=0.000275764, gnorm=0.235, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=71425 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 epoch 031: 1652 / 1707 loss=3.524, nll_loss=1.968, ppl=3.91, wps=366171, ups=0.75, wpb=489776, bsz=16296.6, num_updates=52700, lr=0.000275502, gnorm=0.223, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=71559 end of epoch 31 (average epoch stats below) epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 epoch 031 | loss 3.516 | nll_loss 1.96 | ppl 3.89 | wps 363492 | ups 0.74 | wpb 489897 | bsz 16329.2 | num_updates 52754 | lr 0.000275361 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 2271 | gb_free 60.9 | wall 71631 Start iterating over samples epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 46 / 1707 loss=3.509, nll_loss=1.951, ppl=3.87, wps=360426, ups=0.74, wpb=486077, bsz=16050.1, num_updates=52800, lr=0.000275241, gnorm=0.229, clip=0, loss_scale=1, train_wall=134, gb_free=60.4, wall=71694 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 146 / 1707 loss=3.498, nll_loss=1.938, ppl=3.83, wps=367359, ups=0.75, wpb=489920, bsz=16229, num_updates=52900, lr=0.000274981, gnorm=0.231, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71827 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 epoch 032: 246 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=366531, ups=0.75, wpb=489835, bsz=16636.6, num_updates=53000, lr=0.000274721, gnorm=0.233, clip=0, loss_scale=1, train_wall=133, gb_free=60.7, wall=71961 begin validation on "valid" subset epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.757 | nll_loss 2.212 | ppl 4.63 | wps 214003 | wpb 22263 | bsz 1004 | num_updates 53000 | best_loss 3.747 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 346 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=334884, ups=0.68, wpb=490784, bsz=16218.9, num_updates=53100, lr=0.000274462, gnorm=0.218, clip=0, loss_scale=2, train_wall=133, gb_free=60.9, wall=72107 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 446 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=367994, ups=0.75, wpb=490390, bsz=16609.1, num_updates=53200, lr=0.000274204, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=72240 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 546 / 1707 loss=3.507, nll_loss=1.949, ppl=3.86, wps=364976, ups=0.74, wpb=490514, bsz=16370.6, num_updates=53300, lr=0.000273947, gnorm=0.226, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=72375 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 646 / 1707 loss=3.508, nll_loss=1.95, ppl=3.86, wps=364869, ups=0.75, wpb=489498, bsz=16306.8, num_updates=53400, lr=0.00027369, gnorm=0.233, clip=0, loss_scale=4, train_wall=134, gb_free=61, wall=72509 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 746 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=366442, ups=0.75, wpb=490397, bsz=16378.8, num_updates=53500, lr=0.000273434, gnorm=0.23, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=72643 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 847 / 1707 loss=3.513, nll_loss=1.956, ppl=3.88, wps=363267, ups=0.74, wpb=490486, bsz=16447.8, num_updates=53600, lr=0.000273179, gnorm=0.224, clip=0, loss_scale=4, train_wall=135, gb_free=61, wall=72778 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 947 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366736, ups=0.75, wpb=490208, bsz=16345.8, num_updates=53700, lr=0.000272925, gnorm=0.217, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=72911 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1048 / 1707 loss=3.512, nll_loss=1.955, ppl=3.88, wps=363193, ups=0.74, wpb=490482, bsz=16199.5, num_updates=53800, lr=0.000272671, gnorm=0.234, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73047 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1148 / 1707 loss=3.516, nll_loss=1.96, ppl=3.89, wps=364699, ups=0.74, wpb=490338, bsz=16290.5, num_updates=53900, lr=0.000272418, gnorm=0.224, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73181 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 epoch 032: 1248 / 1707 loss=3.519, nll_loss=1.964, ppl=3.9, wps=365499, ups=0.75, wpb=490011, bsz=16191.1, num_updates=54000, lr=0.000272166, gnorm=0.221, clip=0, loss_scale=4, train_wall=134, gb_free=60.8, wall=73315 begin validation on "valid" subset epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032 | valid on 'valid' subset | loss 3.751 | nll_loss 2.206 | ppl 4.62 | wps 207408 | wpb 22263 | bsz 1004 | num_updates 54000 | best_loss 3.747 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1349 / 1707 loss=3.518, nll_loss=1.962, ppl=3.9, wps=329955, ups=0.67, wpb=489060, bsz=16319.6, num_updates=54100, lr=0.000271914, gnorm=0.233, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=73463 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1450 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=364654, ups=0.74, wpb=491613, bsz=16294.8, num_updates=54200, lr=0.000271663, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=73598 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1550 / 1707 loss=3.521, nll_loss=1.966, ppl=3.91, wps=363986, ups=0.74, wpb=489472, bsz=16492.1, num_updates=54300, lr=0.000271413, gnorm=0.234, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=73733 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 epoch 032: 1650 / 1707 loss=3.52, nll_loss=1.965, ppl=3.9, wps=363739, ups=0.75, wpb=487903, bsz=16291.7, num_updates=54400, lr=0.000271163, gnorm=0.225, clip=0, loss_scale=4, train_wall=134, gb_free=60.3, wall=73867 end of epoch 32 (average epoch stats below) epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 epoch 032 | loss 3.512 | nll_loss 1.955 | ppl 3.88 | wps 360897 | ups 0.74 | wpb 489895 | bsz 16332.3 | num_updates 54456 | lr 0.000271024 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 2275 | gb_free 61.7 | wall 73942 Start iterating over samples epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 44 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=362842, ups=0.74, wpb=487444, bsz=16372, num_updates=54500, lr=0.000270914, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=74001 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 144 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365861, ups=0.75, wpb=490745, bsz=16605.1, num_updates=54600, lr=0.000270666, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74135 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 244 / 1707 loss=3.496, nll_loss=1.936, ppl=3.83, wps=367543, ups=0.75, wpb=491152, bsz=16200.2, num_updates=54700, lr=0.000270418, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=74269 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 345 / 1707 loss=3.497, nll_loss=1.938, ppl=3.83, wps=364849, ups=0.74, wpb=491304, bsz=16665.4, num_updates=54800, lr=0.000270172, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.4, wall=74403 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 445 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=364493, ups=0.74, wpb=489863, bsz=16491.8, num_updates=54900, lr=0.000269925, gnorm=0.231, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=74538 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 epoch 033: 545 / 1707 loss=3.504, nll_loss=1.946, ppl=3.85, wps=367026, ups=0.75, wpb=490090, bsz=16307.3, num_updates=55000, lr=0.00026968, gnorm=0.219, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=74671 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.762 | nll_loss 2.215 | ppl 4.64 | wps 218644 | wpb 22263 | bsz 1004 | num_updates 55000 | best_loss 3.747 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 646 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=319598, ups=0.65, wpb=489305, bsz=16339, num_updates=55100, lr=0.000269435, gnorm=0.23, clip=0, loss_scale=2, train_wall=135, gb_free=60.6, wall=74825 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 746 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=368229, ups=0.75, wpb=490899, bsz=16216, num_updates=55200, lr=0.000269191, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=74958 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 846 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365487, ups=0.75, wpb=489198, bsz=16334.5, num_updates=55300, lr=0.000268947, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.7, wall=75092 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 947 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=363368, ups=0.74, wpb=489467, bsz=16408.9, num_updates=55400, lr=0.000268705, gnorm=0.217, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75226 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1047 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=366777, ups=0.75, wpb=490390, bsz=16430.1, num_updates=55500, lr=0.000268462, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=75360 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1147 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=365552, ups=0.75, wpb=489264, bsz=16103.7, num_updates=55600, lr=0.000268221, gnorm=0.232, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=75494 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1248 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365131, ups=0.74, wpb=490793, bsz=16189.4, num_updates=55700, lr=0.00026798, gnorm=0.216, clip=0, loss_scale=2, train_wall=134, gb_free=60.7, wall=75628 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1348 / 1707 loss=3.514, nll_loss=1.957, ppl=3.88, wps=365148, ups=0.74, wpb=490990, bsz=16264.8, num_updates=55800, lr=0.00026774, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=75763 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1448 / 1707 loss=3.515, nll_loss=1.959, ppl=3.89, wps=365474, ups=0.75, wpb=489898, bsz=16237.4, num_updates=55900, lr=0.0002675, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=75897 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 epoch 033: 1548 / 1707 loss=3.517, nll_loss=1.961, ppl=3.89, wps=366008, ups=0.75, wpb=489468, bsz=16313.9, num_updates=56000, lr=0.000267261, gnorm=0.234, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=76031 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033 | valid on 'valid' subset | loss 3.76 | nll_loss 2.213 | ppl 4.64 | wps 202614 | wpb 22263 | bsz 1004 | num_updates 56000 | best_loss 3.747 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 epoch 033: 1648 / 1707 loss=3.52, nll_loss=1.964, ppl=3.9, wps=333341, ups=0.68, wpb=489560, bsz=16238.2, num_updates=56100, lr=0.000267023, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=76177 end of epoch 33 (average epoch stats below) epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 epoch 033 | loss 3.507 | nll_loss 1.95 | ppl 3.86 | wps 360592 | ups 0.74 | wpb 489892 | bsz 16331.7 | num_updates 56159 | lr 0.000266883 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2273 | gb_free 60.4 | wall 76255 Start iterating over samples epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 41 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=364750, ups=0.75, wpb=485801, bsz=16150.2, num_updates=56200, lr=0.000266785, gnorm=0.23, clip=0, loss_scale=4, train_wall=132, gb_free=60.4, wall=76311 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 142 / 1707 loss=3.487, nll_loss=1.927, ppl=3.8, wps=364059, ups=0.74, wpb=489448, bsz=16203, num_updates=56300, lr=0.000266548, gnorm=0.223, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=76445 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 242 / 1707 loss=3.496, nll_loss=1.937, ppl=3.83, wps=364787, ups=0.75, wpb=489079, bsz=16458.2, num_updates=56400, lr=0.000266312, gnorm=0.228, clip=0, loss_scale=4, train_wall=134, gb_free=60.2, wall=76579 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 343 / 1707 loss=3.495, nll_loss=1.935, ppl=3.82, wps=362179, ups=0.74, wpb=489583, bsz=16155.4, num_updates=56500, lr=0.000266076, gnorm=0.228, clip=0, loss_scale=4, train_wall=135, gb_free=60.6, wall=76714 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 444 / 1707 loss=3.495, nll_loss=1.936, ppl=3.83, wps=364592, ups=0.74, wpb=491826, bsz=16404.5, num_updates=56600, lr=0.000265841, gnorm=0.227, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=76849 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 544 / 1707 loss=3.5, nll_loss=1.941, ppl=3.84, wps=366717, ups=0.75, wpb=489727, bsz=16271, num_updates=56700, lr=0.000265606, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=76983 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 644 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=366195, ups=0.75, wpb=489934, bsz=16388.6, num_updates=56800, lr=0.000265372, gnorm=0.234, clip=0, loss_scale=2, train_wall=133, gb_free=60.6, wall=77117 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 744 / 1707 loss=3.503, nll_loss=1.945, ppl=3.85, wps=367239, ups=0.75, wpb=489831, bsz=15959.2, num_updates=56900, lr=0.000265139, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.9, wall=77250 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 epoch 034: 845 / 1707 loss=3.506, nll_loss=1.949, ppl=3.86, wps=363485, ups=0.74, wpb=490342, bsz=16471.4, num_updates=57000, lr=0.000264906, gnorm=0.228, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=77385 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 213823 | wpb 22263 | bsz 1004 | num_updates 57000 | best_loss 3.747 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 945 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=323800, ups=0.66, wpb=490426, bsz=16482.8, num_updates=57100, lr=0.000264674, gnorm=0.225, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=77536 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1045 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366518, ups=0.75, wpb=490992, bsz=16276.4, num_updates=57200, lr=0.000264443, gnorm=0.233, clip=0, loss_scale=4, train_wall=133, gb_free=60.4, wall=77670 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1146 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=362198, ups=0.74, wpb=490068, bsz=16176.6, num_updates=57300, lr=0.000264212, gnorm=0.234, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=77806 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1246 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=367396, ups=0.75, wpb=490891, bsz=16504.5, num_updates=57400, lr=0.000263982, gnorm=0.233, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=77939 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1346 / 1707 loss=3.511, nll_loss=1.954, ppl=3.87, wps=365766, ups=0.75, wpb=490031, bsz=16320.5, num_updates=57500, lr=0.000263752, gnorm=0.224, clip=0, loss_scale=4, train_wall=133, gb_free=60.6, wall=78073 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1446 / 1707 loss=3.509, nll_loss=1.952, ppl=3.87, wps=366059, ups=0.75, wpb=490063, bsz=16301, num_updates=57600, lr=0.000263523, gnorm=0.237, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=78207 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1547 / 1707 loss=3.514, nll_loss=1.958, ppl=3.88, wps=362534, ups=0.74, wpb=489513, bsz=16581.2, num_updates=57700, lr=0.000263295, gnorm=0.231, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=78342 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 epoch 034: 1647 / 1707 loss=3.511, nll_loss=1.955, ppl=3.88, wps=366425, ups=0.75, wpb=489454, bsz=16547, num_updates=57800, lr=0.000263067, gnorm=0.228, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=78476 end of epoch 34 (average epoch stats below) epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 epoch 034 | loss 3.503 | nll_loss 1.945 | ppl 3.85 | wps 362405 | ups 0.74 | wpb 489895 | bsz 16333.4 | num_updates 57860 | lr 0.00026293 | gnorm 0.23 | clip 0 | loss_scale 2 | train_wall 2273 | gb_free 61.2 | wall 78555 Start iterating over samples epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 40 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365541, ups=0.75, wpb=486536, bsz=16221.1, num_updates=57900, lr=0.00026284, gnorm=0.235, clip=0, loss_scale=4, train_wall=132, gb_free=60.3, wall=78609 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 epoch 035: 141 / 1707 loss=3.483, nll_loss=1.923, ppl=3.79, wps=361699, ups=0.74, wpb=490623, bsz=16242, num_updates=58000, lr=0.000262613, gnorm=0.221, clip=0, loss_scale=2, train_wall=135, gb_free=60.7, wall=78744 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.761 | nll_loss 2.215 | ppl 4.64 | wps 209362 | wpb 22263 | bsz 1004 | num_updates 58000 | best_loss 3.747 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 241 / 1707 loss=3.49, nll_loss=1.93, ppl=3.81, wps=334663, ups=0.68, wpb=490142, bsz=16213.9, num_updates=58100, lr=0.000262387, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.7, wall=78891 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 341 / 1707 loss=3.493, nll_loss=1.933, ppl=3.82, wps=364167, ups=0.74, wpb=489304, bsz=16591.7, num_updates=58200, lr=0.000262161, gnorm=0.23, clip=0, loss_scale=4, train_wall=134, gb_free=60.7, wall=79025 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 441 / 1707 loss=3.492, nll_loss=1.933, ppl=3.82, wps=365855, ups=0.75, wpb=489386, bsz=16342.6, num_updates=58300, lr=0.000261936, gnorm=0.225, clip=0, loss_scale=4, train_wall=133, gb_free=60.5, wall=79159 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 543 / 1707 loss=3.493, nll_loss=1.934, ppl=3.82, wps=359357, ups=0.73, wpb=490616, bsz=16270.3, num_updates=58400, lr=0.000261712, gnorm=0.227, clip=0, loss_scale=1, train_wall=136, gb_free=60.5, wall=79296 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 643 / 1707 loss=3.496, nll_loss=1.938, ppl=3.83, wps=368647, ups=0.75, wpb=490008, bsz=16156.7, num_updates=58500, lr=0.000261488, gnorm=0.242, clip=0, loss_scale=1, train_wall=132, gb_free=60.2, wall=79428 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 743 / 1707 loss=3.498, nll_loss=1.939, ppl=3.83, wps=365371, ups=0.75, wpb=489934, bsz=16254.2, num_updates=58600, lr=0.000261265, gnorm=0.225, clip=0, loss_scale=2, train_wall=134, gb_free=60.9, wall=79563 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 843 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=365830, ups=0.75, wpb=488810, bsz=16307.7, num_updates=58700, lr=0.000261042, gnorm=0.232, clip=0, loss_scale=2, train_wall=133, gb_free=60.5, wall=79696 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 943 / 1707 loss=3.499, nll_loss=1.941, ppl=3.84, wps=366880, ups=0.75, wpb=491200, bsz=16239.5, num_updates=58800, lr=0.00026082, gnorm=0.219, clip=0, loss_scale=2, train_wall=133, gb_free=60.8, wall=79830 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1043 / 1707 loss=3.506, nll_loss=1.948, ppl=3.86, wps=365861, ups=0.75, wpb=489981, bsz=16527.7, num_updates=58900, lr=0.000260599, gnorm=0.229, clip=0, loss_scale=4, train_wall=133, gb_free=61.1, wall=79964 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 epoch 035: 1144 / 1707 loss=3.502, nll_loss=1.945, ppl=3.85, wps=362466, ups=0.74, wpb=491638, bsz=16551.4, num_updates=59000, lr=0.000260378, gnorm=0.229, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=80100 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035 | valid on 'valid' subset | loss 3.757 | nll_loss 2.215 | ppl 4.64 | wps 208610 | wpb 22263 | bsz 1004 | num_updates 59000 | best_loss 3.747 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1244 / 1707 loss=3.505, nll_loss=1.948, ppl=3.86, wps=323030, ups=0.66, wpb=489881, bsz=16350.3, num_updates=59100, lr=0.000260157, gnorm=0.222, clip=0, loss_scale=2, train_wall=133, gb_free=60.3, wall=80251 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1344 / 1707 loss=3.504, nll_loss=1.947, ppl=3.86, wps=365599, ups=0.74, wpb=490954, bsz=16289.3, num_updates=59200, lr=0.000259938, gnorm=0.226, clip=0, loss_scale=2, train_wall=134, gb_free=60.5, wall=80386 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1445 / 1707 loss=3.508, nll_loss=1.951, ppl=3.87, wps=363272, ups=0.74, wpb=489854, bsz=16142.6, num_updates=59300, lr=0.000259718, gnorm=0.224, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=80520 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1545 / 1707 loss=3.502, nll_loss=1.944, ppl=3.85, wps=366636, ups=0.75, wpb=490216, bsz=16143.7, num_updates=59400, lr=0.0002595, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80654 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 epoch 035: 1645 / 1707 loss=3.51, nll_loss=1.953, ppl=3.87, wps=366556, ups=0.75, wpb=490133, bsz=16642.6, num_updates=59500, lr=0.000259281, gnorm=0.227, clip=0, loss_scale=2, train_wall=133, gb_free=60.4, wall=80788 end of epoch 35 (average epoch stats below) epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 epoch 035 | loss 3.499 | nll_loss 1.941 | ppl 3.84 | wps 360198 | ups 0.74 | wpb 489901 | bsz 16328.5 | num_updates 59562 | lr 0.000259147 | gnorm 0.227 | clip 0 | loss_scale 4 | train_wall 2275 | gb_free 61.1 | wall 80870 Start iterating over samples epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 38 / 1707 loss=3.5, nll_loss=1.942, ppl=3.84, wps=365438, ups=0.75, wpb=486518, bsz=16278.8, num_updates=59600, lr=0.000259064, gnorm=0.223, clip=0, loss_scale=4, train_wall=132, gb_free=60.5, wall=80921 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 139 / 1707 loss=3.482, nll_loss=1.921, ppl=3.79, wps=362398, ups=0.74, wpb=489770, bsz=16369.6, num_updates=59700, lr=0.000258847, gnorm=0.224, clip=0, loss_scale=2, train_wall=135, gb_free=60.4, wall=81056 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 239 / 1707 loss=3.483, nll_loss=1.922, ppl=3.79, wps=365054, ups=0.75, wpb=489871, bsz=16383, num_updates=59800, lr=0.00025863, gnorm=0.236, clip=0, loss_scale=2, train_wall=134, gb_free=60.3, wall=81190 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 339 / 1707 loss=3.489, nll_loss=1.929, ppl=3.81, wps=367008, ups=0.75, wpb=490191, bsz=16228.6, num_updates=59900, lr=0.000258414, gnorm=0.226, clip=0, loss_scale=4, train_wall=133, gb_free=60.3, wall=81324 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 epoch 036: 439 / 1707 loss=3.494, nll_loss=1.935, ppl=3.82, wps=365324, ups=0.74, wpb=490620, bsz=16456.7, num_updates=60000, lr=0.000258199, gnorm=0.218, clip=0, loss_scale=4, train_wall=134, gb_free=60.4, wall=81458 Stopping training due to num_updates: 60000 >= max_update: 60000 begin validation on "valid" subset epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 epoch 036 | valid on 'valid' subset | loss 3.761 | nll_loss 2.217 | ppl 4.65 | wps 220709 | wpb 22263 | bsz 1004 | num_updates 60000 | best_loss 3.747 end of epoch 36 (average epoch stats below) epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 epoch 036 | loss 3.486 | nll_loss 1.926 | ppl 3.8 | wps 356972 | ups 0.73 | wpb 490189 | bsz 16365.3 | num_updates 60000 | lr 0.000258199 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 586 | gb_free 60.4 | wall 81471 done training in 81455.3 seconds