{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': 'simple', 'log_file': 'chkpt/ja-en.do03.ado00/train.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': 'wmt23', 'azureml_logging': False, 'seed': 0, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 8, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 8, 'distributed_num_procs': 8, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': 'tcp://localhost:47967', 'distributed_port': 47967, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 8, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 0, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 16384, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 100000, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 16384, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 60000, 'stop_time_hours': 0.0, 'clip_norm': 1.0, 'sentence_avg': False, 'update_freq': [4], 'lr': [0.001], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False, 'debug_param_names': False}, 'checkpoint': {'_name': None, 'save_dir': 'chkpt/ja-en.do03.ado00', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 100000, 'save_interval_updates': 1000, 'keep_interval_updates': 10, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 8}, 'generation': {'_name': None, 'beam': 5, 'beam_mt': 0, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'max_len_a_mt': 0.0, 'max_len_b_mt': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'lenpen_mt': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(no_progress_bar=False, log_interval=100, log_format='simple', log_file='chkpt/ja-en.do03.ado00/train.log', aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project='wmt23', azureml_logging=False, seed=0, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=True, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=8, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='label_smoothed_cross_entropy', tokenizer=None, bpe=None, optimizer='adam', lr_scheduler='inverse_sqrt', scoring='bleu', task='translation', num_workers=0, skip_invalid_size_inputs_valid_test=False, max_tokens=16384, batch_size=None, required_batch_size_multiple=8, required_seq_len_multiple=1, dataset_impl=None, data_buffer_size=10, train_subset='train', valid_subset='valid', combine_valid_subsets=None, ignore_unused_valid_subsets=False, validate_interval=100000, validate_interval_updates=0, validate_after_updates=0, fixed_validation_seed=None, disable_validation=False, max_tokens_valid=16384, batch_size_valid=None, max_valid_steps=None, curriculum=0, gen_subset='test', num_shards=1, shard_id=0, grouped_shuffling=False, update_epoch_batch_itr=False, update_ordered_indices_seed=False, distributed_world_size=8, distributed_num_procs=8, distributed_rank=0, distributed_backend='nccl', distributed_init_method=None, distributed_port=-1, device_id=0, distributed_no_spawn=False, ddp_backend='pytorch_ddp', ddp_comm_hook='none', bucket_cap_mb=25, fix_batches_to_gpus=False, find_unused_parameters=False, gradient_as_bucket_view=False, fast_stat_sync=False, heartbeat_timeout=-1, broadcast_buffers=False, slowmo_momentum=None, slowmo_base_algorithm='localsgd', localsgd_frequency=3, nprocs_per_node=8, pipeline_model_parallel=False, pipeline_balance=None, pipeline_devices=None, pipeline_chunks=0, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_checkpoint='never', zero_sharding='none', no_reshard_after_forward=False, fp32_reduce_scatter=False, cpu_offload=False, use_sharded_state=False, not_fsdp_flatten_parameters=False, arch='transformer_vaswani_wmt_en_de_big', max_epoch=0, max_update=60000, stop_time_hours=0, clip_norm=1.0, sentence_avg=False, update_freq=[4], lr=[0.001], stop_min_lr=-1.0, use_bmuf=False, skip_remainder_batch=False, debug_param_names=False, save_dir='chkpt/ja-en.do03.ado00', restore_file='checkpoint_last.pt', continue_once=None, finetune_from_model=None, reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, optimizer_overrides='{}', save_interval=100000, save_interval_updates=1000, keep_interval_updates=10, keep_interval_updates_pattern=-1, keep_last_epochs=-1, keep_best_checkpoints=-1, no_save=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_save_optimizer_state=False, best_checkpoint_metric='loss', maximize_best_checkpoint_metric=False, patience=-1, checkpoint_suffix='', checkpoint_shard_count=1, load_checkpoint_on_all_dp_ranks=False, write_checkpoints_asynchronously=False, store_ema=False, ema_decay=0.9999, ema_start_update=0, ema_seed_model=None, ema_update_freq=1, ema_fp32=False, data='binarized/ja-en/', source_lang=None, target_lang=None, load_alignments=False, left_pad_source=True, left_pad_target=False, upsample_primary=-1, truncate_source=False, num_batch_buckets=0, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_tokenized_bleu=False, eval_bleu_remove_bpe=None, eval_bleu_print_samples=False, label_smoothing=0.1, report_accuracy=False, ignore_prefix_size=0, adam_betas='(0.9, 0.98)', adam_eps=1e-08, weight_decay=0.0, use_old_adam=False, fp16_adam_stats=False, warmup_updates=4000, warmup_init_lr=-1, pad=1, eos=2, unk=3, encoder_ffn_embed_dim=8192, decoder_ffn_embed_dim=8192, dropout=0.3, attention_dropout=0.0, share_decoder_input_output_embed=True, no_seed_provided=False, encoder_embed_dim=1024, encoder_attention_heads=16, encoder_normalize_before=False, decoder_embed_dim=1024, decoder_attention_heads=16, encoder_embed_path=None, encoder_layers=6, encoder_learned_pos=False, decoder_embed_path=None, decoder_layers=6, decoder_normalize_before=False, decoder_learned_pos=False, activation_dropout=0.0, activation_fn='relu', adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, share_all_embeddings=False, merge_src_tgt_embed=False, no_token_positional_embeddings=False, adaptive_input=False, no_cross_attention=False, cross_self_attention=False, decoder_output_dim=1024, decoder_input_dim=1024, no_scale_embedding=False, layernorm_embedding=False, tie_adaptive_weights=False, checkpoint_activations=False, offload_activations=False, encoder_layers_to_keep=None, decoder_layers_to_keep=None, encoder_layerdrop=0, decoder_layerdrop=0, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, _name='transformer_vaswani_wmt_en_de_big'), 'task': {'_name': 'translation', 'data': 'binarized/ja-en/', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'label_smoothed_cross_entropy', 'label_smoothing': 0.1, 'report_accuracy': False, 'ignore_prefix_size': 0, 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9, 0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.001]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.001]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}} TransformerModel( (encoder): TransformerEncoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(32000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerEncoderLayerBase( (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (dropout_module): FairseqDropout() (activation_dropout_module): FairseqDropout() (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) ) (decoder): TransformerDecoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(16000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerDecoderLayerBase( (dropout_module): FairseqDropout() (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (activation_dropout_module): FairseqDropout() (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (encoder_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (encoder_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) (output_projection): Linear(in_features=1024, out_features=16000, bias=False) ) ) task: TranslationTask model: TransformerModel criterion: LabelSmoothedCrossEntropyCriterion num. shared model params: 326,221,824 (num. trained: 326,221,824) num. expert model params: 0 (num. trained: 0) training on 8 devices (GPUs/TPUs) max tokens per device = 16384 and max sentences per device = None begin dry-run validation on "valid" subset Start iterating over samples epoch 001: 101 / 1689 loss=12.171, nll_loss=11.867, ppl=3734.99, wps=541980, ups=1.09, wpb=495110, bsz=16550.1, num_updates=100, lr=2.5e-05, gnorm=2.374, clip=69, loss_scale=4, train_wall=92, gb_free=21.5, wall=111 epoch 001: 202 / 1689 loss=10.629, nll_loss=10.097, ppl=1095.04, wps=552005, ups=1.12, wpb=494700, bsz=16986.4, num_updates=200, lr=5e-05, gnorm=1.778, clip=96, loss_scale=2, train_wall=89, gb_free=22.3, wall=200 epoch 001: 302 / 1689 loss=9.992, nll_loss=9.34, ppl=647.99, wps=560746, ups=1.13, wpb=496400, bsz=16631, num_updates=300, lr=7.5e-05, gnorm=1.878, clip=100, loss_scale=2, train_wall=88, gb_free=21.7, wall=289 epoch 001: 402 / 1689 loss=9.394, nll_loss=8.634, ppl=397.16, wps=552628, ups=1.12, wpb=494890, bsz=16561.9, num_updates=400, lr=0.0001, gnorm=1.715, clip=100, loss_scale=2, train_wall=89, gb_free=21.8, wall=378 epoch 001: 502 / 1689 loss=8.964, nll_loss=8.124, ppl=278.91, wps=552838, ups=1.12, wpb=495046, bsz=16582.4, num_updates=500, lr=0.000125, gnorm=1.582, clip=99, loss_scale=2, train_wall=89, gb_free=22.2, wall=468 epoch 001: 602 / 1689 loss=8.623, nll_loss=7.72, ppl=210.78, wps=549350, ups=1.11, wpb=495666, bsz=16514.8, num_updates=600, lr=0.00015, gnorm=1.403, clip=98, loss_scale=2, train_wall=89, gb_free=22.4, wall=558 epoch 001: 702 / 1689 loss=8.293, nll_loss=7.335, ppl=161.42, wps=550022, ups=1.11, wpb=494766, bsz=16314.2, num_updates=700, lr=0.000175, gnorm=1.315, clip=94, loss_scale=4, train_wall=89, gb_free=21.7, wall=648 epoch 001: 802 / 1689 loss=7.985, nll_loss=6.977, ppl=125.98, wps=555698, ups=1.12, wpb=496202, bsz=16400.6, num_updates=800, lr=0.0002, gnorm=1.201, clip=84, loss_scale=4, train_wall=88, gb_free=21.9, wall=737 epoch 001: 902 / 1689 loss=7.656, nll_loss=6.597, ppl=96.84, wps=556415, ups=1.12, wpb=496562, bsz=16617.4, num_updates=900, lr=0.000225, gnorm=1.07, clip=60, loss_scale=4, train_wall=88, gb_free=22.2, wall=827 epoch 001: 1002 / 1689 loss=7.361, nll_loss=6.256, ppl=76.41, wps=555140, ups=1.12, wpb=496189, bsz=16704.8, num_updates=1000, lr=0.00025, gnorm=1.023, clip=56, loss_scale=4, train_wall=88, gb_free=21.8, wall=916 begin validation on "valid" subset epoch 001 | valid on 'valid' subset | loss 7.103 | nll_loss 5.902 | ppl 59.78 | wps 0 | wpb 44526 | bsz 2008 | num_updates 1000 epoch 001: 1102 / 1689 loss=7.067, nll_loss=5.916, ppl=60.39, wps=464321, ups=0.94, wpb=494438, bsz=16503.4, num_updates=1100, lr=0.000275, gnorm=0.951, clip=30, loss_scale=4, train_wall=88, gb_free=20.6, wall=1023 epoch 001: 1202 / 1689 loss=6.771, nll_loss=5.577, ppl=47.72, wps=553736, ups=1.12, wpb=494853, bsz=16237.2, num_updates=1200, lr=0.0003, gnorm=0.916, clip=31, loss_scale=8, train_wall=88, gb_free=22.1, wall=1112 epoch 001: 1302 / 1689 loss=6.445, nll_loss=5.203, ppl=36.85, wps=547689, ups=1.11, wpb=493588, bsz=16373.8, num_updates=1300, lr=0.000325, gnorm=0.884, clip=23, loss_scale=8, train_wall=89, gb_free=21.7, wall=1202 epoch 001: 1402 / 1689 loss=6.113, nll_loss=4.823, ppl=28.31, wps=554595, ups=1.12, wpb=497384, bsz=16279.9, num_updates=1400, lr=0.00035, gnorm=0.839, clip=18, loss_scale=8, train_wall=88, gb_free=22.2, wall=1292 epoch 001: 1502 / 1689 loss=5.807, nll_loss=4.475, ppl=22.24, wps=552374, ups=1.12, wpb=494548, bsz=16575.5, num_updates=1500, lr=0.000375, gnorm=0.786, clip=10, loss_scale=8, train_wall=88, gb_free=22.1, wall=1381 epoch 001: 1602 / 1689 loss=5.557, nll_loss=4.194, ppl=18.3, wps=553292, ups=1.12, wpb=495810, bsz=16558.1, num_updates=1600, lr=0.0004, gnorm=0.699, clip=4, loss_scale=8, train_wall=88, gb_free=21.8, wall=1471 end of epoch 1 (average epoch stats below) epoch 001 | loss 7.916 | nll_loss 6.915 | ppl 120.65 | wps 546155 | ups 1.1 | wpb 495118 | bsz 16506.1 | num_updates 1687 | lr 0.00042175 | gnorm 1.244 | clip 57.7 | loss_scale 8 | train_wall 1492 | gb_free 21.6 | wall 1549 Start iterating over samples epoch 002: 14 / 1689 loss=5.389, nll_loss=4.005, ppl=16.06, wps=538245, ups=1.1, wpb=490638, bsz=16281.8, num_updates=1700, lr=0.000425, gnorm=0.67, clip=2, loss_scale=8, train_wall=89, gb_free=22, wall=1562 epoch 002: 14 / 1689 loss=5.389, nll_loss=4.005, ppl=16.06, wps=538245, ups=1.1, wpb=490638, bsz=16281.8, num_updates=1700, lr=0.000425, gnorm=0.67, clip=2, loss_scale=8, train_wall=89, gb_free=22, wall=1562 epoch 002: 114 / 1689 loss=5.234, nll_loss=3.833, ppl=14.25, wps=547721, ups=1.1, wpb=496688, bsz=16760.9, num_updates=1800, lr=0.00045, gnorm=0.592, clip=4, loss_scale=8, train_wall=89, gb_free=21.8, wall=1653 epoch 002: 114 / 1689 loss=5.234, nll_loss=3.833, ppl=14.25, wps=547721, ups=1.1, wpb=496688, bsz=16760.9, num_updates=1800, lr=0.00045, gnorm=0.592, clip=4, loss_scale=8, train_wall=89, gb_free=21.8, wall=1653 epoch 002: 214 / 1689 loss=5.113, nll_loss=3.698, ppl=12.98, wps=545902, ups=1.1, wpb=494526, bsz=16524.7, num_updates=1900, lr=0.000475, gnorm=0.564, clip=1, loss_scale=8, train_wall=89, gb_free=22.2, wall=1743 epoch 002: 214 / 1689 loss=5.113, nll_loss=3.698, ppl=12.98, wps=545902, ups=1.1, wpb=494526, bsz=16524.7, num_updates=1900, lr=0.000475, gnorm=0.564, clip=1, loss_scale=8, train_wall=89, gb_free=22.2, wall=1743 epoch 002: 315 / 1689 loss=4.997, nll_loss=3.571, ppl=11.88, wps=541992, ups=1.1, wpb=494202, bsz=16792.9, num_updates=2000, lr=0.0005, gnorm=0.506, clip=1, loss_scale=4, train_wall=89, gb_free=21.8, wall=1834 epoch 002: 315 / 1689 loss=4.997, nll_loss=3.571, ppl=11.88, wps=541992, ups=1.1, wpb=494202, bsz=16792.9, num_updates=2000, lr=0.0005, gnorm=0.506, clip=1, loss_scale=4, train_wall=89, gb_free=21.8, wall=1834 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 4.988 | nll_loss 3.453 | ppl 10.95 | wps 0 | wpb 44526 | bsz 2008 | num_updates 2000 | best_loss 4.988 epoch 002 | valid on 'valid' subset | loss 4.988 | nll_loss 3.453 | ppl 10.95 | wps 0 | wpb 44526 | bsz 2008 | num_updates 2000 | best_loss 4.988 epoch 002: 415 / 1689 loss=4.901, nll_loss=3.465, ppl=11.04, wps=452795, ups=0.91, wpb=495100, bsz=16294.4, num_updates=2100, lr=0.000525, gnorm=0.493, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=1944 epoch 002: 415 / 1689 loss=4.901, nll_loss=3.465, ppl=11.04, wps=452795, ups=0.91, wpb=495100, bsz=16294.4, num_updates=2100, lr=0.000525, gnorm=0.493, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=1944 epoch 002: 515 / 1689 loss=4.816, nll_loss=3.371, ppl=10.35, wps=549390, ups=1.11, wpb=494927, bsz=16567.2, num_updates=2200, lr=0.00055, gnorm=0.462, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=2034 epoch 002: 515 / 1689 loss=4.816, nll_loss=3.371, ppl=10.35, wps=549390, ups=1.11, wpb=494927, bsz=16567.2, num_updates=2200, lr=0.00055, gnorm=0.462, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=2034 epoch 002: 615 / 1689 loss=4.743, nll_loss=3.292, ppl=9.79, wps=549235, ups=1.11, wpb=495521, bsz=16630.4, num_updates=2300, lr=0.000575, gnorm=0.448, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=2124 epoch 002: 615 / 1689 loss=4.743, nll_loss=3.292, ppl=9.79, wps=549235, ups=1.11, wpb=495521, bsz=16630.4, num_updates=2300, lr=0.000575, gnorm=0.448, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=2124 epoch 002: 715 / 1689 loss=4.69, nll_loss=3.234, ppl=9.41, wps=551748, ups=1.11, wpb=495264, bsz=16162, num_updates=2400, lr=0.0006, gnorm=0.428, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=2214 epoch 002: 715 / 1689 loss=4.69, nll_loss=3.234, ppl=9.41, wps=551748, ups=1.11, wpb=495264, bsz=16162, num_updates=2400, lr=0.0006, gnorm=0.428, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=2214 epoch 002: 815 / 1689 loss=4.625, nll_loss=3.163, ppl=8.96, wps=548910, ups=1.11, wpb=494583, bsz=16776.5, num_updates=2500, lr=0.000625, gnorm=0.416, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=2304 epoch 002: 815 / 1689 loss=4.625, nll_loss=3.163, ppl=8.96, wps=548910, ups=1.11, wpb=494583, bsz=16776.5, num_updates=2500, lr=0.000625, gnorm=0.416, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=2304 epoch 002: 916 / 1689 loss=4.579, nll_loss=3.113, ppl=8.65, wps=549291, ups=1.11, wpb=495583, bsz=16559.8, num_updates=2600, lr=0.00065, gnorm=0.414, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=2394 epoch 002: 916 / 1689 loss=4.579, nll_loss=3.113, ppl=8.65, wps=549291, ups=1.11, wpb=495583, bsz=16559.8, num_updates=2600, lr=0.00065, gnorm=0.414, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=2394 epoch 002: 1016 / 1689 loss=4.538, nll_loss=3.07, ppl=8.4, wps=553412, ups=1.12, wpb=495174, bsz=16381.9, num_updates=2700, lr=0.000675, gnorm=0.416, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=2484 epoch 002: 1016 / 1689 loss=4.538, nll_loss=3.07, ppl=8.4, wps=553412, ups=1.12, wpb=495174, bsz=16381.9, num_updates=2700, lr=0.000675, gnorm=0.416, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=2484 epoch 002: 1116 / 1689 loss=4.505, nll_loss=3.034, ppl=8.19, wps=555745, ups=1.12, wpb=494962, bsz=16373.1, num_updates=2800, lr=0.0007, gnorm=0.391, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=2573 epoch 002: 1116 / 1689 loss=4.505, nll_loss=3.034, ppl=8.19, wps=555745, ups=1.12, wpb=494962, bsz=16373.1, num_updates=2800, lr=0.0007, gnorm=0.391, clip=0, loss_scale=4, train_wall=87, gb_free=22.3, wall=2573 epoch 002: 1216 / 1689 loss=4.46, nll_loss=2.985, ppl=7.92, wps=551475, ups=1.11, wpb=496100, bsz=16472.5, num_updates=2900, lr=0.000725, gnorm=0.372, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=2663 epoch 002: 1216 / 1689 loss=4.46, nll_loss=2.985, ppl=7.92, wps=551475, ups=1.11, wpb=496100, bsz=16472.5, num_updates=2900, lr=0.000725, gnorm=0.372, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=2663 epoch 002: 1316 / 1689 loss=4.429, nll_loss=2.952, ppl=7.74, wps=550382, ups=1.11, wpb=496192, bsz=16642.6, num_updates=3000, lr=0.00075, gnorm=0.382, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=2753 epoch 002: 1316 / 1689 loss=4.429, nll_loss=2.952, ppl=7.74, wps=550382, ups=1.11, wpb=496192, bsz=16642.6, num_updates=3000, lr=0.00075, gnorm=0.382, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=2753 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 4.399 | nll_loss 2.833 | ppl 7.12 | wps 0 | wpb 44526 | bsz 2008 | num_updates 3000 | best_loss 4.399 epoch 002 | valid on 'valid' subset | loss 4.399 | nll_loss 2.833 | ppl 7.12 | wps 0 | wpb 44526 | bsz 2008 | num_updates 3000 | best_loss 4.399 epoch 002: 1417 / 1689 loss=4.398, nll_loss=2.918, ppl=7.56, wps=449176, ups=0.91, wpb=494700, bsz=16270.8, num_updates=3100, lr=0.000775, gnorm=0.377, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=2863 epoch 002: 1417 / 1689 loss=4.398, nll_loss=2.918, ppl=7.56, wps=449176, ups=0.91, wpb=494700, bsz=16270.8, num_updates=3100, lr=0.000775, gnorm=0.377, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=2863 epoch 002: 1517 / 1689 loss=4.379, nll_loss=2.898, ppl=7.45, wps=552893, ups=1.12, wpb=495455, bsz=16419.5, num_updates=3200, lr=0.0008, gnorm=0.382, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=2953 epoch 002: 1517 / 1689 loss=4.379, nll_loss=2.898, ppl=7.45, wps=552893, ups=1.12, wpb=495455, bsz=16419.5, num_updates=3200, lr=0.0008, gnorm=0.382, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=2953 epoch 002: 1617 / 1689 loss=4.351, nll_loss=2.867, ppl=7.3, wps=554731, ups=1.12, wpb=497280, bsz=16313, num_updates=3300, lr=0.000825, gnorm=0.374, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=3042 epoch 002: 1617 / 1689 loss=4.351, nll_loss=2.867, ppl=7.3, wps=554731, ups=1.12, wpb=497280, bsz=16313, num_updates=3300, lr=0.000825, gnorm=0.374, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=3042 end of epoch 2 (average epoch stats below) epoch 002 | loss 4.663 | nll_loss 3.206 | ppl 9.23 | wps 535539 | ups 1.08 | wpb 495113 | bsz 16498.3 | num_updates 3372 | lr 0.000843 | gnorm 0.438 | clip 0.4 | loss_scale 4 | train_wall 1488 | gb_free 25.7 | wall 3107 epoch 002 | loss 4.663 | nll_loss 3.206 | ppl 9.23 | wps 535539 | ups 1.08 | wpb 495113 | bsz 16498.3 | num_updates 3372 | lr 0.000843 | gnorm 0.438 | clip 0.4 | loss_scale 4 | train_wall 1488 | gb_free 25.7 | wall 3107 Start iterating over samples epoch 003: 28 / 1689 loss=4.339, nll_loss=2.855, ppl=7.24, wps=545494, ups=1.11, wpb=490460, bsz=16511.3, num_updates=3400, lr=0.00085, gnorm=0.383, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=3132 epoch 003: 28 / 1689 loss=4.339, nll_loss=2.855, ppl=7.24, wps=545494, ups=1.11, wpb=490460, bsz=16511.3, num_updates=3400, lr=0.00085, gnorm=0.383, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=3132 epoch 003: 28 / 1689 loss=4.339, nll_loss=2.855, ppl=7.24, wps=545494, ups=1.11, wpb=490460, bsz=16511.3, num_updates=3400, lr=0.00085, gnorm=0.383, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=3132 epoch 003: 128 / 1689 loss=4.303, nll_loss=2.816, ppl=7.04, wps=551248, ups=1.11, wpb=495133, bsz=16849.4, num_updates=3500, lr=0.000875, gnorm=0.353, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=3222 epoch 003: 128 / 1689 loss=4.303, nll_loss=2.816, ppl=7.04, wps=551248, ups=1.11, wpb=495133, bsz=16849.4, num_updates=3500, lr=0.000875, gnorm=0.353, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=3222 epoch 003: 128 / 1689 loss=4.303, nll_loss=2.816, ppl=7.04, wps=551248, ups=1.11, wpb=495133, bsz=16849.4, num_updates=3500, lr=0.000875, gnorm=0.353, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=3222 epoch 003: 229 / 1689 loss=4.298, nll_loss=2.81, ppl=7.01, wps=547364, ups=1.1, wpb=495535, bsz=16859, num_updates=3600, lr=0.0009, gnorm=0.378, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=3313 epoch 003: 229 / 1689 loss=4.298, nll_loss=2.81, ppl=7.01, wps=547364, ups=1.1, wpb=495535, bsz=16859, num_updates=3600, lr=0.0009, gnorm=0.378, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=3313 epoch 003: 229 / 1689 loss=4.298, nll_loss=2.81, ppl=7.01, wps=547364, ups=1.1, wpb=495535, bsz=16859, num_updates=3600, lr=0.0009, gnorm=0.378, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=3313 epoch 003: 329 / 1689 loss=4.284, nll_loss=2.796, ppl=6.94, wps=550901, ups=1.11, wpb=494780, bsz=16521.4, num_updates=3700, lr=0.000925, gnorm=0.37, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=3402 epoch 003: 329 / 1689 loss=4.284, nll_loss=2.796, ppl=6.94, wps=550901, ups=1.11, wpb=494780, bsz=16521.4, num_updates=3700, lr=0.000925, gnorm=0.37, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=3402 epoch 003: 329 / 1689 loss=4.284, nll_loss=2.796, ppl=6.94, wps=550901, ups=1.11, wpb=494780, bsz=16521.4, num_updates=3700, lr=0.000925, gnorm=0.37, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=3402 epoch 003: 430 / 1689 loss=4.275, nll_loss=2.787, ppl=6.9, wps=545468, ups=1.1, wpb=494646, bsz=16387.3, num_updates=3800, lr=0.00095, gnorm=0.373, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=3493 epoch 003: 430 / 1689 loss=4.275, nll_loss=2.787, ppl=6.9, wps=545468, ups=1.1, wpb=494646, bsz=16387.3, num_updates=3800, lr=0.00095, gnorm=0.373, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=3493 epoch 003: 430 / 1689 loss=4.275, nll_loss=2.787, ppl=6.9, wps=545468, ups=1.1, wpb=494646, bsz=16387.3, num_updates=3800, lr=0.00095, gnorm=0.373, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=3493 epoch 003: 530 / 1689 loss=4.26, nll_loss=2.77, ppl=6.82, wps=553892, ups=1.12, wpb=496154, bsz=16503, num_updates=3900, lr=0.000975, gnorm=0.388, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=3583 epoch 003: 530 / 1689 loss=4.26, nll_loss=2.77, ppl=6.82, wps=553892, ups=1.12, wpb=496154, bsz=16503, num_updates=3900, lr=0.000975, gnorm=0.388, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=3583 epoch 003: 530 / 1689 loss=4.26, nll_loss=2.77, ppl=6.82, wps=553892, ups=1.12, wpb=496154, bsz=16503, num_updates=3900, lr=0.000975, gnorm=0.388, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=3583 epoch 003: 630 / 1689 loss=4.244, nll_loss=2.753, ppl=6.74, wps=553749, ups=1.12, wpb=495619, bsz=16485.3, num_updates=4000, lr=0.001, gnorm=0.36, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=3672 epoch 003: 630 / 1689 loss=4.244, nll_loss=2.753, ppl=6.74, wps=553749, ups=1.12, wpb=495619, bsz=16485.3, num_updates=4000, lr=0.001, gnorm=0.36, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=3672 epoch 003: 630 / 1689 loss=4.244, nll_loss=2.753, ppl=6.74, wps=553749, ups=1.12, wpb=495619, bsz=16485.3, num_updates=4000, lr=0.001, gnorm=0.36, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=3672 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.224 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 4000 | best_loss 4.224 epoch 003 | valid on 'valid' subset | loss 4.224 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 4000 | best_loss 4.224 epoch 003 | valid on 'valid' subset | loss 4.224 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 44526 | bsz 2008 | num_updates 4000 | best_loss 4.224 epoch 003: 730 / 1689 loss=4.24, nll_loss=2.749, ppl=6.72, wps=456802, ups=0.92, wpb=496107, bsz=16321.5, num_updates=4100, lr=0.00098773, gnorm=0.362, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=3781 epoch 003: 730 / 1689 loss=4.24, nll_loss=2.749, ppl=6.72, wps=456802, ups=0.92, wpb=496107, bsz=16321.5, num_updates=4100, lr=0.00098773, gnorm=0.362, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=3781 epoch 003: 730 / 1689 loss=4.24, nll_loss=2.749, ppl=6.72, wps=456802, ups=0.92, wpb=496107, bsz=16321.5, num_updates=4100, lr=0.00098773, gnorm=0.362, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=3781 epoch 003: 830 / 1689 loss=4.225, nll_loss=2.732, ppl=6.65, wps=547766, ups=1.11, wpb=493256, bsz=16294.1, num_updates=4200, lr=0.0009759, gnorm=0.355, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=3871 epoch 003: 830 / 1689 loss=4.225, nll_loss=2.732, ppl=6.65, wps=547766, ups=1.11, wpb=493256, bsz=16294.1, num_updates=4200, lr=0.0009759, gnorm=0.355, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=3871 epoch 003: 830 / 1689 loss=4.225, nll_loss=2.732, ppl=6.65, wps=547766, ups=1.11, wpb=493256, bsz=16294.1, num_updates=4200, lr=0.0009759, gnorm=0.355, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=3871 epoch 003: 930 / 1689 loss=4.211, nll_loss=2.718, ppl=6.58, wps=549850, ups=1.11, wpb=494080, bsz=16401.6, num_updates=4300, lr=0.000964486, gnorm=0.347, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=3961 epoch 003: 930 / 1689 loss=4.211, nll_loss=2.718, ppl=6.58, wps=549850, ups=1.11, wpb=494080, bsz=16401.6, num_updates=4300, lr=0.000964486, gnorm=0.347, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=3961 epoch 003: 930 / 1689 loss=4.211, nll_loss=2.718, ppl=6.58, wps=549850, ups=1.11, wpb=494080, bsz=16401.6, num_updates=4300, lr=0.000964486, gnorm=0.347, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=3961 epoch 003: 1030 / 1689 loss=4.202, nll_loss=2.709, ppl=6.54, wps=557517, ups=1.12, wpb=496673, bsz=16402.4, num_updates=4400, lr=0.000953463, gnorm=0.345, clip=0, loss_scale=4, train_wall=87, gb_free=22.1, wall=4050 epoch 003: 1030 / 1689 loss=4.202, nll_loss=2.709, ppl=6.54, wps=557517, ups=1.12, wpb=496673, bsz=16402.4, num_updates=4400, lr=0.000953463, gnorm=0.345, clip=0, loss_scale=4, train_wall=87, gb_free=22.1, wall=4050 epoch 003: 1030 / 1689 loss=4.202, nll_loss=2.709, ppl=6.54, wps=557517, ups=1.12, wpb=496673, bsz=16402.4, num_updates=4400, lr=0.000953463, gnorm=0.345, clip=0, loss_scale=4, train_wall=87, gb_free=22.1, wall=4050 epoch 003: 1130 / 1689 loss=4.177, nll_loss=2.682, ppl=6.42, wps=551051, ups=1.11, wpb=496084, bsz=16964.2, num_updates=4500, lr=0.000942809, gnorm=0.339, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=4140 epoch 003: 1130 / 1689 loss=4.177, nll_loss=2.682, ppl=6.42, wps=551051, ups=1.11, wpb=496084, bsz=16964.2, num_updates=4500, lr=0.000942809, gnorm=0.339, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=4140 epoch 003: 1130 / 1689 loss=4.177, nll_loss=2.682, ppl=6.42, wps=551051, ups=1.11, wpb=496084, bsz=16964.2, num_updates=4500, lr=0.000942809, gnorm=0.339, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=4140 epoch 003: 1230 / 1689 loss=4.173, nll_loss=2.677, ppl=6.4, wps=548829, ups=1.11, wpb=494071, bsz=16463.8, num_updates=4600, lr=0.000932505, gnorm=0.341, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=4230 epoch 003: 1230 / 1689 loss=4.173, nll_loss=2.677, ppl=6.4, wps=548829, ups=1.11, wpb=494071, bsz=16463.8, num_updates=4600, lr=0.000932505, gnorm=0.341, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=4230 epoch 003: 1230 / 1689 loss=4.173, nll_loss=2.677, ppl=6.4, wps=548829, ups=1.11, wpb=494071, bsz=16463.8, num_updates=4600, lr=0.000932505, gnorm=0.341, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=4230 epoch 003: 1330 / 1689 loss=4.161, nll_loss=2.664, ppl=6.34, wps=547625, ups=1.1, wpb=495849, bsz=16334.4, num_updates=4700, lr=0.000922531, gnorm=0.333, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=4320 epoch 003: 1330 / 1689 loss=4.161, nll_loss=2.664, ppl=6.34, wps=547625, ups=1.1, wpb=495849, bsz=16334.4, num_updates=4700, lr=0.000922531, gnorm=0.333, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=4320 epoch 003: 1330 / 1689 loss=4.161, nll_loss=2.664, ppl=6.34, wps=547625, ups=1.1, wpb=495849, bsz=16334.4, num_updates=4700, lr=0.000922531, gnorm=0.333, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=4320 epoch 003: 1431 / 1689 loss=4.145, nll_loss=2.647, ppl=6.27, wps=547690, ups=1.1, wpb=496274, bsz=16696.8, num_updates=4800, lr=0.000912871, gnorm=0.324, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=4411 epoch 003: 1431 / 1689 loss=4.145, nll_loss=2.647, ppl=6.27, wps=547690, ups=1.1, wpb=496274, bsz=16696.8, num_updates=4800, lr=0.000912871, gnorm=0.324, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=4411 epoch 003: 1431 / 1689 loss=4.145, nll_loss=2.647, ppl=6.27, wps=547690, ups=1.1, wpb=496274, bsz=16696.8, num_updates=4800, lr=0.000912871, gnorm=0.324, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=4411 epoch 003: 1531 / 1689 loss=4.133, nll_loss=2.635, ppl=6.21, wps=555869, ups=1.12, wpb=495920, bsz=16447.6, num_updates=4900, lr=0.000903508, gnorm=0.327, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=4500 epoch 003: 1531 / 1689 loss=4.133, nll_loss=2.635, ppl=6.21, wps=555869, ups=1.12, wpb=495920, bsz=16447.6, num_updates=4900, lr=0.000903508, gnorm=0.327, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=4500 epoch 003: 1531 / 1689 loss=4.133, nll_loss=2.635, ppl=6.21, wps=555869, ups=1.12, wpb=495920, bsz=16447.6, num_updates=4900, lr=0.000903508, gnorm=0.327, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=4500 epoch 003: 1631 / 1689 loss=4.13, nll_loss=2.632, ppl=6.2, wps=556512, ups=1.12, wpb=495532, bsz=16098.1, num_updates=5000, lr=0.000894427, gnorm=0.325, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=4589 epoch 003: 1631 / 1689 loss=4.13, nll_loss=2.632, ppl=6.2, wps=556512, ups=1.12, wpb=495532, bsz=16098.1, num_updates=5000, lr=0.000894427, gnorm=0.325, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=4589 epoch 003: 1631 / 1689 loss=4.13, nll_loss=2.632, ppl=6.2, wps=556512, ups=1.12, wpb=495532, bsz=16098.1, num_updates=5000, lr=0.000894427, gnorm=0.325, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=4589 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.085 | nll_loss 2.523 | ppl 5.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 5000 | best_loss 4.085 epoch 003 | valid on 'valid' subset | loss 4.085 | nll_loss 2.523 | ppl 5.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 5000 | best_loss 4.085 epoch 003 | valid on 'valid' subset | loss 4.085 | nll_loss 2.523 | ppl 5.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 5000 | best_loss 4.085 end of epoch 3 (average epoch stats below) epoch 003 | loss 4.215 | nll_loss 2.722 | ppl 6.6 | wps 537683 | ups 1.09 | wpb 495119 | bsz 16505.4 | num_updates 5058 | lr 0.000889284 | gnorm 0.352 | clip 0 | loss_scale 4 | train_wall 1486 | gb_free 23.3 | wall 4659 epoch 003 | loss 4.215 | nll_loss 2.722 | ppl 6.6 | wps 537683 | ups 1.09 | wpb 495119 | bsz 16505.4 | num_updates 5058 | lr 0.000889284 | gnorm 0.352 | clip 0 | loss_scale 4 | train_wall 1486 | gb_free 23.3 | wall 4659 epoch 003 | loss 4.215 | nll_loss 2.722 | ppl 6.6 | wps 537683 | ups 1.09 | wpb 495119 | bsz 16505.4 | num_updates 5058 | lr 0.000889284 | gnorm 0.352 | clip 0 | loss_scale 4 | train_wall 1486 | gb_free 23.3 | wall 4659 Start iterating over samples epoch 004: 42 / 1689 loss=4.107, nll_loss=2.606, ppl=6.09, wps=413343, ups=0.84, wpb=491659, bsz=16823.7, num_updates=5100, lr=0.000885615, gnorm=0.325, clip=0, loss_scale=4, train_wall=95, gb_free=22.2, wall=4708 epoch 004: 42 / 1689 loss=4.107, nll_loss=2.606, ppl=6.09, wps=413343, ups=0.84, wpb=491659, bsz=16823.7, num_updates=5100, lr=0.000885615, gnorm=0.325, clip=0, loss_scale=4, train_wall=95, gb_free=22.2, wall=4708 epoch 004: 42 / 1689 loss=4.107, nll_loss=2.606, ppl=6.09, wps=413343, ups=0.84, wpb=491659, bsz=16823.7, num_updates=5100, lr=0.000885615, gnorm=0.325, clip=0, loss_scale=4, train_wall=95, gb_free=22.2, wall=4708 epoch 004: 42 / 1689 loss=4.107, nll_loss=2.606, ppl=6.09, wps=413343, ups=0.84, wpb=491659, bsz=16823.7, num_updates=5100, lr=0.000885615, gnorm=0.325, clip=0, loss_scale=4, train_wall=95, gb_free=22.2, wall=4708 epoch 004: 142 / 1689 loss=4.097, nll_loss=2.595, ppl=6.04, wps=556021, ups=1.12, wpb=496061, bsz=16378.9, num_updates=5200, lr=0.000877058, gnorm=0.318, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=4797 epoch 004: 142 / 1689 loss=4.097, nll_loss=2.595, ppl=6.04, wps=556021, ups=1.12, wpb=496061, bsz=16378.9, num_updates=5200, lr=0.000877058, gnorm=0.318, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=4797 epoch 004: 142 / 1689 loss=4.097, nll_loss=2.595, ppl=6.04, wps=556021, ups=1.12, wpb=496061, bsz=16378.9, num_updates=5200, lr=0.000877058, gnorm=0.318, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=4797 epoch 004: 142 / 1689 loss=4.097, nll_loss=2.595, ppl=6.04, wps=556021, ups=1.12, wpb=496061, bsz=16378.9, num_updates=5200, lr=0.000877058, gnorm=0.318, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=4797 epoch 004: 243 / 1689 loss=4.087, nll_loss=2.584, ppl=6, wps=550022, ups=1.11, wpb=496275, bsz=16334, num_updates=5300, lr=0.000868744, gnorm=0.312, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=4888 epoch 004: 243 / 1689 loss=4.087, nll_loss=2.584, ppl=6, wps=550022, ups=1.11, wpb=496275, bsz=16334, num_updates=5300, lr=0.000868744, gnorm=0.312, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=4888 epoch 004: 243 / 1689 loss=4.087, nll_loss=2.584, ppl=6, wps=550022, ups=1.11, wpb=496275, bsz=16334, num_updates=5300, lr=0.000868744, gnorm=0.312, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=4888 epoch 004: 243 / 1689 loss=4.087, nll_loss=2.584, ppl=6, wps=550022, ups=1.11, wpb=496275, bsz=16334, num_updates=5300, lr=0.000868744, gnorm=0.312, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=4888 epoch 004: 343 / 1689 loss=4.08, nll_loss=2.577, ppl=5.97, wps=551748, ups=1.12, wpb=494056, bsz=16740.5, num_updates=5400, lr=0.000860663, gnorm=0.323, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=4977 epoch 004: 343 / 1689 loss=4.08, nll_loss=2.577, ppl=5.97, wps=551748, ups=1.12, wpb=494056, bsz=16740.5, num_updates=5400, lr=0.000860663, gnorm=0.323, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=4977 epoch 004: 343 / 1689 loss=4.08, nll_loss=2.577, ppl=5.97, wps=551748, ups=1.12, wpb=494056, bsz=16740.5, num_updates=5400, lr=0.000860663, gnorm=0.323, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=4977 epoch 004: 343 / 1689 loss=4.08, nll_loss=2.577, ppl=5.97, wps=551748, ups=1.12, wpb=494056, bsz=16740.5, num_updates=5400, lr=0.000860663, gnorm=0.323, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=4977 epoch 004: 443 / 1689 loss=4.067, nll_loss=2.563, ppl=5.91, wps=557522, ups=1.12, wpb=496846, bsz=16651.6, num_updates=5500, lr=0.000852803, gnorm=0.317, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=5066 epoch 004: 443 / 1689 loss=4.067, nll_loss=2.563, ppl=5.91, wps=557522, ups=1.12, wpb=496846, bsz=16651.6, num_updates=5500, lr=0.000852803, gnorm=0.317, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=5066 epoch 004: 443 / 1689 loss=4.067, nll_loss=2.563, ppl=5.91, wps=557522, ups=1.12, wpb=496846, bsz=16651.6, num_updates=5500, lr=0.000852803, gnorm=0.317, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=5066 epoch 004: 443 / 1689 loss=4.067, nll_loss=2.563, ppl=5.91, wps=557522, ups=1.12, wpb=496846, bsz=16651.6, num_updates=5500, lr=0.000852803, gnorm=0.317, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=5066 epoch 004: 543 / 1689 loss=4.077, nll_loss=2.574, ppl=5.96, wps=557279, ups=1.12, wpb=495837, bsz=16469.4, num_updates=5600, lr=0.000845154, gnorm=0.306, clip=0, loss_scale=2, train_wall=88, gb_free=22.8, wall=5155 epoch 004: 543 / 1689 loss=4.077, nll_loss=2.574, ppl=5.96, wps=557279, ups=1.12, wpb=495837, bsz=16469.4, num_updates=5600, lr=0.000845154, gnorm=0.306, clip=0, loss_scale=2, train_wall=88, gb_free=22.8, wall=5155 epoch 004: 543 / 1689 loss=4.077, nll_loss=2.574, ppl=5.96, wps=557279, ups=1.12, wpb=495837, bsz=16469.4, num_updates=5600, lr=0.000845154, gnorm=0.306, clip=0, loss_scale=2, train_wall=88, gb_free=22.8, wall=5155 epoch 004: 543 / 1689 loss=4.077, nll_loss=2.574, ppl=5.96, wps=557279, ups=1.12, wpb=495837, bsz=16469.4, num_updates=5600, lr=0.000845154, gnorm=0.306, clip=0, loss_scale=2, train_wall=88, gb_free=22.8, wall=5155 epoch 004: 643 / 1689 loss=4.061, nll_loss=2.557, ppl=5.89, wps=552153, ups=1.12, wpb=494337, bsz=16538.4, num_updates=5700, lr=0.000837708, gnorm=0.316, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=5245 epoch 004: 643 / 1689 loss=4.061, nll_loss=2.557, ppl=5.89, wps=552153, ups=1.12, wpb=494337, bsz=16538.4, num_updates=5700, lr=0.000837708, gnorm=0.316, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=5245 epoch 004: 643 / 1689 loss=4.061, nll_loss=2.557, ppl=5.89, wps=552153, ups=1.12, wpb=494337, bsz=16538.4, num_updates=5700, lr=0.000837708, gnorm=0.316, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=5245 epoch 004: 643 / 1689 loss=4.061, nll_loss=2.557, ppl=5.89, wps=552153, ups=1.12, wpb=494337, bsz=16538.4, num_updates=5700, lr=0.000837708, gnorm=0.316, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=5245 epoch 004: 743 / 1689 loss=4.054, nll_loss=2.55, ppl=5.86, wps=552952, ups=1.11, wpb=495997, bsz=16521.5, num_updates=5800, lr=0.000830455, gnorm=0.307, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=5335 epoch 004: 743 / 1689 loss=4.054, nll_loss=2.55, ppl=5.86, wps=552952, ups=1.11, wpb=495997, bsz=16521.5, num_updates=5800, lr=0.000830455, gnorm=0.307, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=5335 epoch 004: 743 / 1689 loss=4.054, nll_loss=2.55, ppl=5.86, wps=552952, ups=1.11, wpb=495997, bsz=16521.5, num_updates=5800, lr=0.000830455, gnorm=0.307, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=5335 epoch 004: 743 / 1689 loss=4.054, nll_loss=2.55, ppl=5.86, wps=552952, ups=1.11, wpb=495997, bsz=16521.5, num_updates=5800, lr=0.000830455, gnorm=0.307, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=5335 epoch 004: 843 / 1689 loss=4.059, nll_loss=2.556, ppl=5.88, wps=552995, ups=1.12, wpb=495011, bsz=16309.6, num_updates=5900, lr=0.000823387, gnorm=0.303, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=5424 epoch 004: 843 / 1689 loss=4.059, nll_loss=2.556, ppl=5.88, wps=552995, ups=1.12, wpb=495011, bsz=16309.6, num_updates=5900, lr=0.000823387, gnorm=0.303, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=5424 epoch 004: 843 / 1689 loss=4.059, nll_loss=2.556, ppl=5.88, wps=552995, ups=1.12, wpb=495011, bsz=16309.6, num_updates=5900, lr=0.000823387, gnorm=0.303, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=5424 epoch 004: 843 / 1689 loss=4.059, nll_loss=2.556, ppl=5.88, wps=552995, ups=1.12, wpb=495011, bsz=16309.6, num_updates=5900, lr=0.000823387, gnorm=0.303, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=5424 epoch 004: 943 / 1689 loss=4.039, nll_loss=2.534, ppl=5.79, wps=555002, ups=1.12, wpb=495818, bsz=16316.6, num_updates=6000, lr=0.000816497, gnorm=0.31, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=5513 epoch 004: 943 / 1689 loss=4.039, nll_loss=2.534, ppl=5.79, wps=555002, ups=1.12, wpb=495818, bsz=16316.6, num_updates=6000, lr=0.000816497, gnorm=0.31, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=5513 epoch 004: 943 / 1689 loss=4.039, nll_loss=2.534, ppl=5.79, wps=555002, ups=1.12, wpb=495818, bsz=16316.6, num_updates=6000, lr=0.000816497, gnorm=0.31, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=5513 epoch 004: 943 / 1689 loss=4.039, nll_loss=2.534, ppl=5.79, wps=555002, ups=1.12, wpb=495818, bsz=16316.6, num_updates=6000, lr=0.000816497, gnorm=0.31, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=5513 begin validation on "valid" subset epoch 004 | valid on 'valid' subset | loss 4.015 | nll_loss 2.445 | ppl 5.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 4.015 epoch 004 | valid on 'valid' subset | loss 4.015 | nll_loss 2.445 | ppl 5.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 4.015 epoch 004 | valid on 'valid' subset | loss 4.015 | nll_loss 2.445 | ppl 5.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 4.015 epoch 004 | valid on 'valid' subset | loss 4.015 | nll_loss 2.445 | ppl 5.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 4.015 epoch 004: 1043 / 1689 loss=4.034, nll_loss=2.529, ppl=5.77, wps=392973, ups=0.79, wpb=495074, bsz=16657.2, num_updates=6100, lr=0.000809776, gnorm=0.307, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=5639 epoch 004: 1043 / 1689 loss=4.034, nll_loss=2.529, ppl=5.77, wps=392973, ups=0.79, wpb=495074, bsz=16657.2, num_updates=6100, lr=0.000809776, gnorm=0.307, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=5639 epoch 004: 1043 / 1689 loss=4.034, nll_loss=2.529, ppl=5.77, wps=392973, ups=0.79, wpb=495074, bsz=16657.2, num_updates=6100, lr=0.000809776, gnorm=0.307, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=5639 epoch 004: 1043 / 1689 loss=4.034, nll_loss=2.529, ppl=5.77, wps=392973, ups=0.79, wpb=495074, bsz=16657.2, num_updates=6100, lr=0.000809776, gnorm=0.307, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=5639 epoch 004: 1143 / 1689 loss=4.034, nll_loss=2.528, ppl=5.77, wps=560229, ups=1.13, wpb=495020, bsz=16677, num_updates=6200, lr=0.000803219, gnorm=0.294, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=5728 epoch 004: 1143 / 1689 loss=4.034, nll_loss=2.528, ppl=5.77, wps=560229, ups=1.13, wpb=495020, bsz=16677, num_updates=6200, lr=0.000803219, gnorm=0.294, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=5728 epoch 004: 1143 / 1689 loss=4.034, nll_loss=2.528, ppl=5.77, wps=560229, ups=1.13, wpb=495020, bsz=16677, num_updates=6200, lr=0.000803219, gnorm=0.294, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=5728 epoch 004: 1143 / 1689 loss=4.034, nll_loss=2.528, ppl=5.77, wps=560229, ups=1.13, wpb=495020, bsz=16677, num_updates=6200, lr=0.000803219, gnorm=0.294, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=5728 epoch 004: 1243 / 1689 loss=4.026, nll_loss=2.52, ppl=5.74, wps=561477, ups=1.13, wpb=495036, bsz=16285.8, num_updates=6300, lr=0.000796819, gnorm=0.296, clip=0, loss_scale=8, train_wall=88, gb_free=21.9, wall=5816 epoch 004: 1243 / 1689 loss=4.026, nll_loss=2.52, ppl=5.74, wps=561477, ups=1.13, wpb=495036, bsz=16285.8, num_updates=6300, lr=0.000796819, gnorm=0.296, clip=0, loss_scale=8, train_wall=88, gb_free=21.9, wall=5816 epoch 004: 1243 / 1689 loss=4.026, nll_loss=2.52, ppl=5.74, wps=561477, ups=1.13, wpb=495036, bsz=16285.8, num_updates=6300, lr=0.000796819, gnorm=0.296, clip=0, loss_scale=8, train_wall=88, gb_free=21.9, wall=5816 epoch 004: 1243 / 1689 loss=4.026, nll_loss=2.52, ppl=5.74, wps=561477, ups=1.13, wpb=495036, bsz=16285.8, num_updates=6300, lr=0.000796819, gnorm=0.296, clip=0, loss_scale=8, train_wall=88, gb_free=21.9, wall=5816 epoch 004: 1344 / 1689 loss=4.024, nll_loss=2.518, ppl=5.73, wps=550035, ups=1.11, wpb=494481, bsz=16660.2, num_updates=6400, lr=0.000790569, gnorm=0.3, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=5906 epoch 004: 1344 / 1689 loss=4.024, nll_loss=2.518, ppl=5.73, wps=550035, ups=1.11, wpb=494481, bsz=16660.2, num_updates=6400, lr=0.000790569, gnorm=0.3, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=5906 epoch 004: 1344 / 1689 loss=4.024, nll_loss=2.518, ppl=5.73, wps=550035, ups=1.11, wpb=494481, bsz=16660.2, num_updates=6400, lr=0.000790569, gnorm=0.3, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=5906 epoch 004: 1344 / 1689 loss=4.024, nll_loss=2.518, ppl=5.73, wps=550035, ups=1.11, wpb=494481, bsz=16660.2, num_updates=6400, lr=0.000790569, gnorm=0.3, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=5906 epoch 004: 1444 / 1689 loss=4.016, nll_loss=2.509, ppl=5.69, wps=557134, ups=1.13, wpb=494377, bsz=16748.4, num_updates=6500, lr=0.000784465, gnorm=0.309, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=5995 epoch 004: 1444 / 1689 loss=4.016, nll_loss=2.509, ppl=5.69, wps=557134, ups=1.13, wpb=494377, bsz=16748.4, num_updates=6500, lr=0.000784465, gnorm=0.309, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=5995 epoch 004: 1444 / 1689 loss=4.016, nll_loss=2.509, ppl=5.69, wps=557134, ups=1.13, wpb=494377, bsz=16748.4, num_updates=6500, lr=0.000784465, gnorm=0.309, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=5995 epoch 004: 1444 / 1689 loss=4.016, nll_loss=2.509, ppl=5.69, wps=557134, ups=1.13, wpb=494377, bsz=16748.4, num_updates=6500, lr=0.000784465, gnorm=0.309, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=5995 epoch 004: 1545 / 1689 loss=4.013, nll_loss=2.506, ppl=5.68, wps=554651, ups=1.12, wpb=494570, bsz=16400.1, num_updates=6600, lr=0.000778499, gnorm=0.305, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=6084 epoch 004: 1545 / 1689 loss=4.013, nll_loss=2.506, ppl=5.68, wps=554651, ups=1.12, wpb=494570, bsz=16400.1, num_updates=6600, lr=0.000778499, gnorm=0.305, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=6084 epoch 004: 1545 / 1689 loss=4.013, nll_loss=2.506, ppl=5.68, wps=554651, ups=1.12, wpb=494570, bsz=16400.1, num_updates=6600, lr=0.000778499, gnorm=0.305, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=6084 epoch 004: 1545 / 1689 loss=4.013, nll_loss=2.506, ppl=5.68, wps=554651, ups=1.12, wpb=494570, bsz=16400.1, num_updates=6600, lr=0.000778499, gnorm=0.305, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=6084 epoch 004: 1645 / 1689 loss=4.001, nll_loss=2.493, ppl=5.63, wps=561102, ups=1.13, wpb=497136, bsz=16175.6, num_updates=6700, lr=0.000772667, gnorm=0.294, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=6172 epoch 004: 1645 / 1689 loss=4.001, nll_loss=2.493, ppl=5.63, wps=561102, ups=1.13, wpb=497136, bsz=16175.6, num_updates=6700, lr=0.000772667, gnorm=0.294, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=6172 epoch 004: 1645 / 1689 loss=4.001, nll_loss=2.493, ppl=5.63, wps=561102, ups=1.13, wpb=497136, bsz=16175.6, num_updates=6700, lr=0.000772667, gnorm=0.294, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=6172 epoch 004: 1645 / 1689 loss=4.001, nll_loss=2.493, ppl=5.63, wps=561102, ups=1.13, wpb=497136, bsz=16175.6, num_updates=6700, lr=0.000772667, gnorm=0.294, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=6172 end of epoch 4 (average epoch stats below) epoch 004 | loss 4.048 | nll_loss 2.543 | ppl 5.83 | wps 537971 | ups 1.09 | wpb 495128 | bsz 16500 | num_updates 6744 | lr 0.000770143 | gnorm 0.308 | clip 0 | loss_scale 2 | train_wall 1489 | gb_free 23.5 | wall 6211 epoch 004 | loss 4.048 | nll_loss 2.543 | ppl 5.83 | wps 537971 | ups 1.09 | wpb 495128 | bsz 16500 | num_updates 6744 | lr 0.000770143 | gnorm 0.308 | clip 0 | loss_scale 2 | train_wall 1489 | gb_free 23.5 | wall 6211 epoch 004 | loss 4.048 | nll_loss 2.543 | ppl 5.83 | wps 537971 | ups 1.09 | wpb 495128 | bsz 16500 | num_updates 6744 | lr 0.000770143 | gnorm 0.308 | clip 0 | loss_scale 2 | train_wall 1489 | gb_free 23.5 | wall 6211 epoch 004 | loss 4.048 | nll_loss 2.543 | ppl 5.83 | wps 537971 | ups 1.09 | wpb 495128 | bsz 16500 | num_updates 6744 | lr 0.000770143 | gnorm 0.308 | clip 0 | loss_scale 2 | train_wall 1489 | gb_free 23.5 | wall 6211 Start iterating over samples epoch 005: 56 / 1689 loss=3.993, nll_loss=2.484, ppl=5.59, wps=547187, ups=1.11, wpb=490858, bsz=16249.8, num_updates=6800, lr=0.000766965, gnorm=0.297, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=6262 epoch 005: 56 / 1689 loss=3.993, nll_loss=2.484, ppl=5.59, wps=547187, ups=1.11, wpb=490858, bsz=16249.8, num_updates=6800, lr=0.000766965, gnorm=0.297, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=6262 epoch 005: 56 / 1689 loss=3.993, nll_loss=2.484, ppl=5.59, wps=547187, ups=1.11, wpb=490858, bsz=16249.8, num_updates=6800, lr=0.000766965, gnorm=0.297, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=6262 epoch 005: 56 / 1689 loss=3.993, nll_loss=2.484, ppl=5.59, wps=547187, ups=1.11, wpb=490858, bsz=16249.8, num_updates=6800, lr=0.000766965, gnorm=0.297, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=6262 epoch 005: 56 / 1689 loss=3.993, nll_loss=2.484, ppl=5.59, wps=547187, ups=1.11, wpb=490858, bsz=16249.8, num_updates=6800, lr=0.000766965, gnorm=0.297, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=6262 epoch 005: 156 / 1689 loss=3.978, nll_loss=2.467, ppl=5.53, wps=559935, ups=1.13, wpb=495192, bsz=16462.1, num_updates=6900, lr=0.000761387, gnorm=0.296, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=6350 epoch 005: 156 / 1689 loss=3.978, nll_loss=2.467, ppl=5.53, wps=559935, ups=1.13, wpb=495192, bsz=16462.1, num_updates=6900, lr=0.000761387, gnorm=0.296, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=6350 epoch 005: 156 / 1689 loss=3.978, nll_loss=2.467, ppl=5.53, wps=559935, ups=1.13, wpb=495192, bsz=16462.1, num_updates=6900, lr=0.000761387, gnorm=0.296, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=6350 epoch 005: 156 / 1689 loss=3.978, nll_loss=2.467, ppl=5.53, wps=559935, ups=1.13, wpb=495192, bsz=16462.1, num_updates=6900, lr=0.000761387, gnorm=0.296, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=6350 epoch 005: 156 / 1689 loss=3.978, nll_loss=2.467, ppl=5.53, wps=559935, ups=1.13, wpb=495192, bsz=16462.1, num_updates=6900, lr=0.000761387, gnorm=0.296, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=6350 epoch 005: 256 / 1689 loss=3.972, nll_loss=2.46, ppl=5.5, wps=553284, ups=1.12, wpb=494521, bsz=16393, num_updates=7000, lr=0.000755929, gnorm=0.298, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=6440 epoch 005: 256 / 1689 loss=3.972, nll_loss=2.46, ppl=5.5, wps=553284, ups=1.12, wpb=494521, bsz=16393, num_updates=7000, lr=0.000755929, gnorm=0.298, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=6440 epoch 005: 256 / 1689 loss=3.972, nll_loss=2.46, ppl=5.5, wps=553284, ups=1.12, wpb=494521, bsz=16393, num_updates=7000, lr=0.000755929, gnorm=0.298, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=6440 epoch 005: 256 / 1689 loss=3.972, nll_loss=2.46, ppl=5.5, wps=553284, ups=1.12, wpb=494521, bsz=16393, num_updates=7000, lr=0.000755929, gnorm=0.298, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=6440 epoch 005: 256 / 1689 loss=3.972, nll_loss=2.46, ppl=5.5, wps=553284, ups=1.12, wpb=494521, bsz=16393, num_updates=7000, lr=0.000755929, gnorm=0.298, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=6440 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 3.959 | nll_loss 2.389 | ppl 5.24 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.959 epoch 005 | valid on 'valid' subset | loss 3.959 | nll_loss 2.389 | ppl 5.24 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.959 epoch 005 | valid on 'valid' subset | loss 3.959 | nll_loss 2.389 | ppl 5.24 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.959 epoch 005 | valid on 'valid' subset | loss 3.959 | nll_loss 2.389 | ppl 5.24 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.959 epoch 005 | valid on 'valid' subset | loss 3.959 | nll_loss 2.389 | ppl 5.24 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.959 epoch 005: 356 / 1689 loss=3.978, nll_loss=2.468, ppl=5.53, wps=459462, ups=0.93, wpb=495062, bsz=16475.9, num_updates=7100, lr=0.000750587, gnorm=0.298, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=6548 epoch 005: 356 / 1689 loss=3.978, nll_loss=2.468, ppl=5.53, wps=459462, ups=0.93, wpb=495062, bsz=16475.9, num_updates=7100, lr=0.000750587, gnorm=0.298, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=6548 epoch 005: 356 / 1689 loss=3.978, nll_loss=2.468, ppl=5.53, wps=459462, ups=0.93, wpb=495062, bsz=16475.9, num_updates=7100, lr=0.000750587, gnorm=0.298, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=6548 epoch 005: 356 / 1689 loss=3.978, nll_loss=2.468, ppl=5.53, wps=459462, ups=0.93, wpb=495062, bsz=16475.9, num_updates=7100, lr=0.000750587, gnorm=0.298, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=6548 epoch 005: 356 / 1689 loss=3.978, nll_loss=2.468, ppl=5.53, wps=459462, ups=0.93, wpb=495062, bsz=16475.9, num_updates=7100, lr=0.000750587, gnorm=0.298, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=6548 epoch 005: 456 / 1689 loss=3.966, nll_loss=2.454, ppl=5.48, wps=557641, ups=1.12, wpb=496292, bsz=16665.3, num_updates=7200, lr=0.000745356, gnorm=0.282, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=6637 epoch 005: 456 / 1689 loss=3.966, nll_loss=2.454, ppl=5.48, wps=557641, ups=1.12, wpb=496292, bsz=16665.3, num_updates=7200, lr=0.000745356, gnorm=0.282, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=6637 epoch 005: 456 / 1689 loss=3.966, nll_loss=2.454, ppl=5.48, wps=557641, ups=1.12, wpb=496292, bsz=16665.3, num_updates=7200, lr=0.000745356, gnorm=0.282, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=6637 epoch 005: 456 / 1689 loss=3.966, nll_loss=2.454, ppl=5.48, wps=557641, ups=1.12, wpb=496292, bsz=16665.3, num_updates=7200, lr=0.000745356, gnorm=0.282, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=6637 epoch 005: 456 / 1689 loss=3.966, nll_loss=2.454, ppl=5.48, wps=557641, ups=1.12, wpb=496292, bsz=16665.3, num_updates=7200, lr=0.000745356, gnorm=0.282, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=6637 epoch 005: 556 / 1689 loss=3.962, nll_loss=2.451, ppl=5.47, wps=561972, ups=1.13, wpb=495804, bsz=16580.6, num_updates=7300, lr=0.000740233, gnorm=0.286, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=6725 epoch 005: 556 / 1689 loss=3.962, nll_loss=2.451, ppl=5.47, wps=561972, ups=1.13, wpb=495804, bsz=16580.6, num_updates=7300, lr=0.000740233, gnorm=0.286, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=6725 epoch 005: 556 / 1689 loss=3.962, nll_loss=2.451, ppl=5.47, wps=561972, ups=1.13, wpb=495804, bsz=16580.6, num_updates=7300, lr=0.000740233, gnorm=0.286, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=6725 epoch 005: 556 / 1689 loss=3.962, nll_loss=2.451, ppl=5.47, wps=561972, ups=1.13, wpb=495804, bsz=16580.6, num_updates=7300, lr=0.000740233, gnorm=0.286, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=6725 epoch 005: 556 / 1689 loss=3.962, nll_loss=2.451, ppl=5.47, wps=561972, ups=1.13, wpb=495804, bsz=16580.6, num_updates=7300, lr=0.000740233, gnorm=0.286, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=6725 epoch 005: 656 / 1689 loss=3.969, nll_loss=2.458, ppl=5.49, wps=562935, ups=1.14, wpb=495704, bsz=16432, num_updates=7400, lr=0.000735215, gnorm=0.293, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=6813 epoch 005: 656 / 1689 loss=3.969, nll_loss=2.458, ppl=5.49, wps=562935, ups=1.14, wpb=495704, bsz=16432, num_updates=7400, lr=0.000735215, gnorm=0.293, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=6813 epoch 005: 656 / 1689 loss=3.969, nll_loss=2.458, ppl=5.49, wps=562935, ups=1.14, wpb=495704, bsz=16432, num_updates=7400, lr=0.000735215, gnorm=0.293, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=6813 epoch 005: 656 / 1689 loss=3.969, nll_loss=2.458, ppl=5.49, wps=562935, ups=1.14, wpb=495704, bsz=16432, num_updates=7400, lr=0.000735215, gnorm=0.293, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=6813 epoch 005: 656 / 1689 loss=3.969, nll_loss=2.458, ppl=5.49, wps=562935, ups=1.14, wpb=495704, bsz=16432, num_updates=7400, lr=0.000735215, gnorm=0.293, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=6813 epoch 005: 756 / 1689 loss=3.969, nll_loss=2.458, ppl=5.5, wps=562337, ups=1.14, wpb=494399, bsz=16383.8, num_updates=7500, lr=0.000730297, gnorm=0.302, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=6901 epoch 005: 756 / 1689 loss=3.969, nll_loss=2.458, ppl=5.5, wps=562337, ups=1.14, wpb=494399, bsz=16383.8, num_updates=7500, lr=0.000730297, gnorm=0.302, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=6901 epoch 005: 756 / 1689 loss=3.969, nll_loss=2.458, ppl=5.5, wps=562337, ups=1.14, wpb=494399, bsz=16383.8, num_updates=7500, lr=0.000730297, gnorm=0.302, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=6901 epoch 005: 756 / 1689 loss=3.969, nll_loss=2.458, ppl=5.5, wps=562337, ups=1.14, wpb=494399, bsz=16383.8, num_updates=7500, lr=0.000730297, gnorm=0.302, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=6901 epoch 005: 756 / 1689 loss=3.969, nll_loss=2.458, ppl=5.5, wps=562337, ups=1.14, wpb=494399, bsz=16383.8, num_updates=7500, lr=0.000730297, gnorm=0.302, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=6901 epoch 005: 857 / 1689 loss=3.961, nll_loss=2.45, ppl=5.46, wps=550124, ups=1.11, wpb=495167, bsz=16523.8, num_updates=7600, lr=0.000725476, gnorm=0.291, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=6991 epoch 005: 857 / 1689 loss=3.961, nll_loss=2.45, ppl=5.46, wps=550124, ups=1.11, wpb=495167, bsz=16523.8, num_updates=7600, lr=0.000725476, gnorm=0.291, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=6991 epoch 005: 857 / 1689 loss=3.961, nll_loss=2.45, ppl=5.46, wps=550124, ups=1.11, wpb=495167, bsz=16523.8, num_updates=7600, lr=0.000725476, gnorm=0.291, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=6991 epoch 005: 857 / 1689 loss=3.961, nll_loss=2.45, ppl=5.46, wps=550124, ups=1.11, wpb=495167, bsz=16523.8, num_updates=7600, lr=0.000725476, gnorm=0.291, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=6991 epoch 005: 857 / 1689 loss=3.961, nll_loss=2.45, ppl=5.46, wps=550124, ups=1.11, wpb=495167, bsz=16523.8, num_updates=7600, lr=0.000725476, gnorm=0.291, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=6991 epoch 005: 957 / 1689 loss=3.967, nll_loss=2.457, ppl=5.49, wps=562853, ups=1.13, wpb=496087, bsz=16488.3, num_updates=7700, lr=0.00072075, gnorm=0.292, clip=0, loss_scale=4, train_wall=87, gb_free=22.5, wall=7079 epoch 005: 957 / 1689 loss=3.967, nll_loss=2.457, ppl=5.49, wps=562853, ups=1.13, wpb=496087, bsz=16488.3, num_updates=7700, lr=0.00072075, gnorm=0.292, clip=0, loss_scale=4, train_wall=87, gb_free=22.5, wall=7079 epoch 005: 957 / 1689 loss=3.967, nll_loss=2.457, ppl=5.49, wps=562853, ups=1.13, wpb=496087, bsz=16488.3, num_updates=7700, lr=0.00072075, gnorm=0.292, clip=0, loss_scale=4, train_wall=87, gb_free=22.5, wall=7079 epoch 005: 957 / 1689 loss=3.967, nll_loss=2.457, ppl=5.49, wps=562853, ups=1.13, wpb=496087, bsz=16488.3, num_updates=7700, lr=0.00072075, gnorm=0.292, clip=0, loss_scale=4, train_wall=87, gb_free=22.5, wall=7079 epoch 005: 957 / 1689 loss=3.967, nll_loss=2.457, ppl=5.49, wps=562853, ups=1.13, wpb=496087, bsz=16488.3, num_updates=7700, lr=0.00072075, gnorm=0.292, clip=0, loss_scale=4, train_wall=87, gb_free=22.5, wall=7079 epoch 005: 1057 / 1689 loss=3.952, nll_loss=2.44, ppl=5.43, wps=557230, ups=1.12, wpb=497680, bsz=16646.3, num_updates=7800, lr=0.000716115, gnorm=0.281, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=7168 epoch 005: 1057 / 1689 loss=3.952, nll_loss=2.44, ppl=5.43, wps=557230, ups=1.12, wpb=497680, bsz=16646.3, num_updates=7800, lr=0.000716115, gnorm=0.281, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=7168 epoch 005: 1057 / 1689 loss=3.952, nll_loss=2.44, ppl=5.43, wps=557230, ups=1.12, wpb=497680, bsz=16646.3, num_updates=7800, lr=0.000716115, gnorm=0.281, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=7168 epoch 005: 1057 / 1689 loss=3.952, nll_loss=2.44, ppl=5.43, wps=557230, ups=1.12, wpb=497680, bsz=16646.3, num_updates=7800, lr=0.000716115, gnorm=0.281, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=7168 epoch 005: 1057 / 1689 loss=3.952, nll_loss=2.44, ppl=5.43, wps=557230, ups=1.12, wpb=497680, bsz=16646.3, num_updates=7800, lr=0.000716115, gnorm=0.281, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=7168 epoch 005: 1157 / 1689 loss=3.947, nll_loss=2.435, ppl=5.41, wps=554138, ups=1.12, wpb=493839, bsz=16664.2, num_updates=7900, lr=0.000711568, gnorm=0.289, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=7257 epoch 005: 1157 / 1689 loss=3.947, nll_loss=2.435, ppl=5.41, wps=554138, ups=1.12, wpb=493839, bsz=16664.2, num_updates=7900, lr=0.000711568, gnorm=0.289, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=7257 epoch 005: 1157 / 1689 loss=3.947, nll_loss=2.435, ppl=5.41, wps=554138, ups=1.12, wpb=493839, bsz=16664.2, num_updates=7900, lr=0.000711568, gnorm=0.289, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=7257 epoch 005: 1157 / 1689 loss=3.947, nll_loss=2.435, ppl=5.41, wps=554138, ups=1.12, wpb=493839, bsz=16664.2, num_updates=7900, lr=0.000711568, gnorm=0.289, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=7257 epoch 005: 1157 / 1689 loss=3.947, nll_loss=2.435, ppl=5.41, wps=554138, ups=1.12, wpb=493839, bsz=16664.2, num_updates=7900, lr=0.000711568, gnorm=0.289, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=7257 epoch 005: 1258 / 1689 loss=3.948, nll_loss=2.437, ppl=5.41, wps=542473, ups=1.1, wpb=494076, bsz=16424.2, num_updates=8000, lr=0.000707107, gnorm=0.286, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=7348 epoch 005: 1258 / 1689 loss=3.948, nll_loss=2.437, ppl=5.41, wps=542473, ups=1.1, wpb=494076, bsz=16424.2, num_updates=8000, lr=0.000707107, gnorm=0.286, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=7348 epoch 005: 1258 / 1689 loss=3.948, nll_loss=2.437, ppl=5.41, wps=542473, ups=1.1, wpb=494076, bsz=16424.2, num_updates=8000, lr=0.000707107, gnorm=0.286, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=7348 epoch 005: 1258 / 1689 loss=3.948, nll_loss=2.437, ppl=5.41, wps=542473, ups=1.1, wpb=494076, bsz=16424.2, num_updates=8000, lr=0.000707107, gnorm=0.286, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=7348 epoch 005: 1258 / 1689 loss=3.948, nll_loss=2.437, ppl=5.41, wps=542473, ups=1.1, wpb=494076, bsz=16424.2, num_updates=8000, lr=0.000707107, gnorm=0.286, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=7348 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 3.918 | nll_loss 2.35 | ppl 5.1 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.918 epoch 005 | valid on 'valid' subset | loss 3.918 | nll_loss 2.35 | ppl 5.1 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.918 epoch 005 | valid on 'valid' subset | loss 3.918 | nll_loss 2.35 | ppl 5.1 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.918 epoch 005 | valid on 'valid' subset | loss 3.918 | nll_loss 2.35 | ppl 5.1 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.918 epoch 005 | valid on 'valid' subset | loss 3.918 | nll_loss 2.35 | ppl 5.1 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.918 epoch 005: 1358 / 1689 loss=3.945, nll_loss=2.433, ppl=5.4, wps=456296, ups=0.92, wpb=495112, bsz=16727.7, num_updates=8100, lr=0.000702728, gnorm=0.293, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=7457 epoch 005: 1358 / 1689 loss=3.945, nll_loss=2.433, ppl=5.4, wps=456296, ups=0.92, wpb=495112, bsz=16727.7, num_updates=8100, lr=0.000702728, gnorm=0.293, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=7457 epoch 005: 1358 / 1689 loss=3.945, nll_loss=2.433, ppl=5.4, wps=456296, ups=0.92, wpb=495112, bsz=16727.7, num_updates=8100, lr=0.000702728, gnorm=0.293, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=7457 epoch 005: 1358 / 1689 loss=3.945, nll_loss=2.433, ppl=5.4, wps=456296, ups=0.92, wpb=495112, bsz=16727.7, num_updates=8100, lr=0.000702728, gnorm=0.293, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=7457 epoch 005: 1358 / 1689 loss=3.945, nll_loss=2.433, ppl=5.4, wps=456296, ups=0.92, wpb=495112, bsz=16727.7, num_updates=8100, lr=0.000702728, gnorm=0.293, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=7457 epoch 005: 1458 / 1689 loss=3.94, nll_loss=2.427, ppl=5.38, wps=556542, ups=1.12, wpb=495608, bsz=16479.6, num_updates=8200, lr=0.00069843, gnorm=0.279, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=7546 epoch 005: 1458 / 1689 loss=3.94, nll_loss=2.427, ppl=5.38, wps=556542, ups=1.12, wpb=495608, bsz=16479.6, num_updates=8200, lr=0.00069843, gnorm=0.279, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=7546 epoch 005: 1458 / 1689 loss=3.94, nll_loss=2.427, ppl=5.38, wps=556542, ups=1.12, wpb=495608, bsz=16479.6, num_updates=8200, lr=0.00069843, gnorm=0.279, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=7546 epoch 005: 1458 / 1689 loss=3.94, nll_loss=2.427, ppl=5.38, wps=556542, ups=1.12, wpb=495608, bsz=16479.6, num_updates=8200, lr=0.00069843, gnorm=0.279, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=7546 epoch 005: 1458 / 1689 loss=3.94, nll_loss=2.427, ppl=5.38, wps=556542, ups=1.12, wpb=495608, bsz=16479.6, num_updates=8200, lr=0.00069843, gnorm=0.279, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=7546 epoch 005: 1558 / 1689 loss=3.941, nll_loss=2.428, ppl=5.38, wps=554425, ups=1.12, wpb=495981, bsz=16139.1, num_updates=8300, lr=0.00069421, gnorm=0.274, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=7635 epoch 005: 1558 / 1689 loss=3.941, nll_loss=2.428, ppl=5.38, wps=554425, ups=1.12, wpb=495981, bsz=16139.1, num_updates=8300, lr=0.00069421, gnorm=0.274, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=7635 epoch 005: 1558 / 1689 loss=3.941, nll_loss=2.428, ppl=5.38, wps=554425, ups=1.12, wpb=495981, bsz=16139.1, num_updates=8300, lr=0.00069421, gnorm=0.274, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=7635 epoch 005: 1558 / 1689 loss=3.941, nll_loss=2.428, ppl=5.38, wps=554425, ups=1.12, wpb=495981, bsz=16139.1, num_updates=8300, lr=0.00069421, gnorm=0.274, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=7635 epoch 005: 1558 / 1689 loss=3.941, nll_loss=2.428, ppl=5.38, wps=554425, ups=1.12, wpb=495981, bsz=16139.1, num_updates=8300, lr=0.00069421, gnorm=0.274, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=7635 epoch 005: 1658 / 1689 loss=3.936, nll_loss=2.424, ppl=5.37, wps=551046, ups=1.11, wpb=495442, bsz=16865, num_updates=8400, lr=0.000690066, gnorm=0.284, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=7725 epoch 005: 1658 / 1689 loss=3.936, nll_loss=2.424, ppl=5.37, wps=551046, ups=1.11, wpb=495442, bsz=16865, num_updates=8400, lr=0.000690066, gnorm=0.284, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=7725 epoch 005: 1658 / 1689 loss=3.936, nll_loss=2.424, ppl=5.37, wps=551046, ups=1.11, wpb=495442, bsz=16865, num_updates=8400, lr=0.000690066, gnorm=0.284, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=7725 epoch 005: 1658 / 1689 loss=3.936, nll_loss=2.424, ppl=5.37, wps=551046, ups=1.11, wpb=495442, bsz=16865, num_updates=8400, lr=0.000690066, gnorm=0.284, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=7725 epoch 005: 1658 / 1689 loss=3.936, nll_loss=2.424, ppl=5.37, wps=551046, ups=1.11, wpb=495442, bsz=16865, num_updates=8400, lr=0.000690066, gnorm=0.284, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=7725 end of epoch 5 (average epoch stats below) epoch 005 | loss 3.959 | nll_loss 2.447 | ppl 5.45 | wps 541830 | ups 1.09 | wpb 495116 | bsz 16501.7 | num_updates 8431 | lr 0.000688796 | gnorm 0.289 | clip 0 | loss_scale 2 | train_wall 1478 | gb_free 22.5 | wall 7752 epoch 005 | loss 3.959 | nll_loss 2.447 | ppl 5.45 | wps 541830 | ups 1.09 | wpb 495116 | bsz 16501.7 | num_updates 8431 | lr 0.000688796 | gnorm 0.289 | clip 0 | loss_scale 2 | train_wall 1478 | gb_free 22.5 | wall 7752 epoch 005 | loss 3.959 | nll_loss 2.447 | ppl 5.45 | wps 541830 | ups 1.09 | wpb 495116 | bsz 16501.7 | num_updates 8431 | lr 0.000688796 | gnorm 0.289 | clip 0 | loss_scale 2 | train_wall 1478 | gb_free 22.5 | wall 7752 epoch 005 | loss 3.959 | nll_loss 2.447 | ppl 5.45 | wps 541830 | ups 1.09 | wpb 495116 | bsz 16501.7 | num_updates 8431 | lr 0.000688796 | gnorm 0.289 | clip 0 | loss_scale 2 | train_wall 1478 | gb_free 22.5 | wall 7752 epoch 005 | loss 3.959 | nll_loss 2.447 | ppl 5.45 | wps 541830 | ups 1.09 | wpb 495116 | bsz 16501.7 | num_updates 8431 | lr 0.000688796 | gnorm 0.289 | clip 0 | loss_scale 2 | train_wall 1478 | gb_free 22.5 | wall 7752 Start iterating over samples epoch 006: 69 / 1689 loss=3.915, nll_loss=2.399, ppl=5.27, wps=550804, ups=1.12, wpb=492669, bsz=15980, num_updates=8500, lr=0.000685994, gnorm=0.278, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=7815 epoch 006: 69 / 1689 loss=3.915, nll_loss=2.399, ppl=5.27, wps=550804, ups=1.12, wpb=492669, bsz=15980, num_updates=8500, lr=0.000685994, gnorm=0.278, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=7815 epoch 006: 69 / 1689 loss=3.915, nll_loss=2.399, ppl=5.27, wps=550804, ups=1.12, wpb=492669, bsz=15980, num_updates=8500, lr=0.000685994, gnorm=0.278, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=7815 epoch 006: 69 / 1689 loss=3.915, nll_loss=2.399, ppl=5.27, wps=550804, ups=1.12, wpb=492669, bsz=15980, num_updates=8500, lr=0.000685994, gnorm=0.278, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=7815 epoch 006: 69 / 1689 loss=3.915, nll_loss=2.399, ppl=5.27, wps=550804, ups=1.12, wpb=492669, bsz=15980, num_updates=8500, lr=0.000685994, gnorm=0.278, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=7815 epoch 006: 69 / 1689 loss=3.915, nll_loss=2.399, ppl=5.27, wps=550804, ups=1.12, wpb=492669, bsz=15980, num_updates=8500, lr=0.000685994, gnorm=0.278, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=7815 epoch 006: 169 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=556642, ups=1.12, wpb=495379, bsz=16313.9, num_updates=8600, lr=0.000681994, gnorm=0.289, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=7904 epoch 006: 169 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=556642, ups=1.12, wpb=495379, bsz=16313.9, num_updates=8600, lr=0.000681994, gnorm=0.289, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=7904 epoch 006: 169 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=556642, ups=1.12, wpb=495379, bsz=16313.9, num_updates=8600, lr=0.000681994, gnorm=0.289, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=7904 epoch 006: 169 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=556642, ups=1.12, wpb=495379, bsz=16313.9, num_updates=8600, lr=0.000681994, gnorm=0.289, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=7904 epoch 006: 169 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=556642, ups=1.12, wpb=495379, bsz=16313.9, num_updates=8600, lr=0.000681994, gnorm=0.289, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=7904 epoch 006: 169 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=556642, ups=1.12, wpb=495379, bsz=16313.9, num_updates=8600, lr=0.000681994, gnorm=0.289, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=7904 epoch 006: 269 / 1689 loss=3.904, nll_loss=2.387, ppl=5.23, wps=557903, ups=1.12, wpb=495942, bsz=16622.8, num_updates=8700, lr=0.000678064, gnorm=0.269, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=7993 epoch 006: 269 / 1689 loss=3.904, nll_loss=2.387, ppl=5.23, wps=557903, ups=1.12, wpb=495942, bsz=16622.8, num_updates=8700, lr=0.000678064, gnorm=0.269, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=7993 epoch 006: 269 / 1689 loss=3.904, nll_loss=2.387, ppl=5.23, wps=557903, ups=1.12, wpb=495942, bsz=16622.8, num_updates=8700, lr=0.000678064, gnorm=0.269, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=7993 epoch 006: 269 / 1689 loss=3.904, nll_loss=2.387, ppl=5.23, wps=557903, ups=1.12, wpb=495942, bsz=16622.8, num_updates=8700, lr=0.000678064, gnorm=0.269, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=7993 epoch 006: 269 / 1689 loss=3.904, nll_loss=2.387, ppl=5.23, wps=557903, ups=1.12, wpb=495942, bsz=16622.8, num_updates=8700, lr=0.000678064, gnorm=0.269, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=7993 epoch 006: 269 / 1689 loss=3.904, nll_loss=2.387, ppl=5.23, wps=557903, ups=1.12, wpb=495942, bsz=16622.8, num_updates=8700, lr=0.000678064, gnorm=0.269, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=7993 epoch 006: 369 / 1689 loss=3.913, nll_loss=2.397, ppl=5.27, wps=551814, ups=1.12, wpb=494476, bsz=16313.7, num_updates=8800, lr=0.0006742, gnorm=0.282, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=8082 epoch 006: 369 / 1689 loss=3.913, nll_loss=2.397, ppl=5.27, wps=551814, ups=1.12, wpb=494476, bsz=16313.7, num_updates=8800, lr=0.0006742, gnorm=0.282, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=8082 epoch 006: 369 / 1689 loss=3.913, nll_loss=2.397, ppl=5.27, wps=551814, ups=1.12, wpb=494476, bsz=16313.7, num_updates=8800, lr=0.0006742, gnorm=0.282, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=8082 epoch 006: 369 / 1689 loss=3.913, nll_loss=2.397, ppl=5.27, wps=551814, ups=1.12, wpb=494476, bsz=16313.7, num_updates=8800, lr=0.0006742, gnorm=0.282, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=8082 epoch 006: 369 / 1689 loss=3.913, nll_loss=2.397, ppl=5.27, wps=551814, ups=1.12, wpb=494476, bsz=16313.7, num_updates=8800, lr=0.0006742, gnorm=0.282, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=8082 epoch 006: 369 / 1689 loss=3.913, nll_loss=2.397, ppl=5.27, wps=551814, ups=1.12, wpb=494476, bsz=16313.7, num_updates=8800, lr=0.0006742, gnorm=0.282, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=8082 epoch 006: 469 / 1689 loss=3.905, nll_loss=2.388, ppl=5.24, wps=552236, ups=1.11, wpb=496002, bsz=16443.6, num_updates=8900, lr=0.000670402, gnorm=0.282, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=8172 epoch 006: 469 / 1689 loss=3.905, nll_loss=2.388, ppl=5.24, wps=552236, ups=1.11, wpb=496002, bsz=16443.6, num_updates=8900, lr=0.000670402, gnorm=0.282, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=8172 epoch 006: 469 / 1689 loss=3.905, nll_loss=2.388, ppl=5.24, wps=552236, ups=1.11, wpb=496002, bsz=16443.6, num_updates=8900, lr=0.000670402, gnorm=0.282, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=8172 epoch 006: 469 / 1689 loss=3.905, nll_loss=2.388, ppl=5.24, wps=552236, ups=1.11, wpb=496002, bsz=16443.6, num_updates=8900, lr=0.000670402, gnorm=0.282, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=8172 epoch 006: 469 / 1689 loss=3.905, nll_loss=2.388, ppl=5.24, wps=552236, ups=1.11, wpb=496002, bsz=16443.6, num_updates=8900, lr=0.000670402, gnorm=0.282, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=8172 epoch 006: 469 / 1689 loss=3.905, nll_loss=2.388, ppl=5.24, wps=552236, ups=1.11, wpb=496002, bsz=16443.6, num_updates=8900, lr=0.000670402, gnorm=0.282, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=8172 epoch 006: 570 / 1689 loss=3.907, nll_loss=2.391, ppl=5.25, wps=549780, ups=1.11, wpb=495486, bsz=16670.2, num_updates=9000, lr=0.000666667, gnorm=0.275, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=8262 epoch 006: 570 / 1689 loss=3.907, nll_loss=2.391, ppl=5.25, wps=549780, ups=1.11, wpb=495486, bsz=16670.2, num_updates=9000, lr=0.000666667, gnorm=0.275, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=8262 epoch 006: 570 / 1689 loss=3.907, nll_loss=2.391, ppl=5.25, wps=549780, ups=1.11, wpb=495486, bsz=16670.2, num_updates=9000, lr=0.000666667, gnorm=0.275, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=8262 epoch 006: 570 / 1689 loss=3.907, nll_loss=2.391, ppl=5.25, wps=549780, ups=1.11, wpb=495486, bsz=16670.2, num_updates=9000, lr=0.000666667, gnorm=0.275, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=8262 epoch 006: 570 / 1689 loss=3.907, nll_loss=2.391, ppl=5.25, wps=549780, ups=1.11, wpb=495486, bsz=16670.2, num_updates=9000, lr=0.000666667, gnorm=0.275, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=8262 epoch 006: 570 / 1689 loss=3.907, nll_loss=2.391, ppl=5.25, wps=549780, ups=1.11, wpb=495486, bsz=16670.2, num_updates=9000, lr=0.000666667, gnorm=0.275, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=8262 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 3.899 | nll_loss 2.329 | ppl 5.02 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.899 epoch 006 | valid on 'valid' subset | loss 3.899 | nll_loss 2.329 | ppl 5.02 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.899 epoch 006 | valid on 'valid' subset | loss 3.899 | nll_loss 2.329 | ppl 5.02 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.899 epoch 006 | valid on 'valid' subset | loss 3.899 | nll_loss 2.329 | ppl 5.02 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.899 epoch 006 | valid on 'valid' subset | loss 3.899 | nll_loss 2.329 | ppl 5.02 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.899 epoch 006 | valid on 'valid' subset | loss 3.899 | nll_loss 2.329 | ppl 5.02 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.899 epoch 006: 670 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=460130, ups=0.93, wpb=494036, bsz=16817.4, num_updates=9100, lr=0.000662994, gnorm=0.28, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=8370 epoch 006: 670 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=460130, ups=0.93, wpb=494036, bsz=16817.4, num_updates=9100, lr=0.000662994, gnorm=0.28, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=8370 epoch 006: 670 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=460130, ups=0.93, wpb=494036, bsz=16817.4, num_updates=9100, lr=0.000662994, gnorm=0.28, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=8370 epoch 006: 670 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=460130, ups=0.93, wpb=494036, bsz=16817.4, num_updates=9100, lr=0.000662994, gnorm=0.28, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=8370 epoch 006: 670 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=460130, ups=0.93, wpb=494036, bsz=16817.4, num_updates=9100, lr=0.000662994, gnorm=0.28, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=8370 epoch 006: 670 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=460130, ups=0.93, wpb=494036, bsz=16817.4, num_updates=9100, lr=0.000662994, gnorm=0.28, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=8370 epoch 006: 770 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=552413, ups=1.11, wpb=495786, bsz=16402.5, num_updates=9200, lr=0.00065938, gnorm=0.27, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=8459 epoch 006: 770 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=552413, ups=1.11, wpb=495786, bsz=16402.5, num_updates=9200, lr=0.00065938, gnorm=0.27, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=8459 epoch 006: 770 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=552413, ups=1.11, wpb=495786, bsz=16402.5, num_updates=9200, lr=0.00065938, gnorm=0.27, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=8459 epoch 006: 770 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=552413, ups=1.11, wpb=495786, bsz=16402.5, num_updates=9200, lr=0.00065938, gnorm=0.27, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=8459 epoch 006: 770 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=552413, ups=1.11, wpb=495786, bsz=16402.5, num_updates=9200, lr=0.00065938, gnorm=0.27, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=8459 epoch 006: 770 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=552413, ups=1.11, wpb=495786, bsz=16402.5, num_updates=9200, lr=0.00065938, gnorm=0.27, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=8459 epoch 006: 870 / 1689 loss=3.906, nll_loss=2.391, ppl=5.24, wps=553237, ups=1.12, wpb=495285, bsz=16919.1, num_updates=9300, lr=0.000655826, gnorm=0.292, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=8549 epoch 006: 870 / 1689 loss=3.906, nll_loss=2.391, ppl=5.24, wps=553237, ups=1.12, wpb=495285, bsz=16919.1, num_updates=9300, lr=0.000655826, gnorm=0.292, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=8549 epoch 006: 870 / 1689 loss=3.906, nll_loss=2.391, ppl=5.24, wps=553237, ups=1.12, wpb=495285, bsz=16919.1, num_updates=9300, lr=0.000655826, gnorm=0.292, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=8549 epoch 006: 870 / 1689 loss=3.906, nll_loss=2.391, ppl=5.24, wps=553237, ups=1.12, wpb=495285, bsz=16919.1, num_updates=9300, lr=0.000655826, gnorm=0.292, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=8549 epoch 006: 870 / 1689 loss=3.906, nll_loss=2.391, ppl=5.24, wps=553237, ups=1.12, wpb=495285, bsz=16919.1, num_updates=9300, lr=0.000655826, gnorm=0.292, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=8549 epoch 006: 870 / 1689 loss=3.906, nll_loss=2.391, ppl=5.24, wps=553237, ups=1.12, wpb=495285, bsz=16919.1, num_updates=9300, lr=0.000655826, gnorm=0.292, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=8549 epoch 006: 970 / 1689 loss=3.908, nll_loss=2.392, ppl=5.25, wps=557148, ups=1.13, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.275, clip=0, loss_scale=4, train_wall=87, gb_free=22.1, wall=8637 epoch 006: 970 / 1689 loss=3.908, nll_loss=2.392, ppl=5.25, wps=557148, ups=1.13, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.275, clip=0, loss_scale=4, train_wall=87, gb_free=22.1, wall=8637 epoch 006: 970 / 1689 loss=3.908, nll_loss=2.392, ppl=5.25, wps=557148, ups=1.13, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.275, clip=0, loss_scale=4, train_wall=87, gb_free=22.1, wall=8637 epoch 006: 970 / 1689 loss=3.908, nll_loss=2.392, ppl=5.25, wps=557148, ups=1.13, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.275, clip=0, loss_scale=4, train_wall=87, gb_free=22.1, wall=8637 epoch 006: 970 / 1689 loss=3.908, nll_loss=2.392, ppl=5.25, wps=557148, ups=1.13, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.275, clip=0, loss_scale=4, train_wall=87, gb_free=22.1, wall=8637 epoch 006: 970 / 1689 loss=3.908, nll_loss=2.392, ppl=5.25, wps=557148, ups=1.13, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.275, clip=0, loss_scale=4, train_wall=87, gb_free=22.1, wall=8637 epoch 006: 1070 / 1689 loss=3.9, nll_loss=2.384, ppl=5.22, wps=554004, ups=1.12, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.272, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=8727 epoch 006: 1070 / 1689 loss=3.9, nll_loss=2.384, ppl=5.22, wps=554004, ups=1.12, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.272, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=8727 epoch 006: 1070 / 1689 loss=3.9, nll_loss=2.384, ppl=5.22, wps=554004, ups=1.12, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.272, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=8727 epoch 006: 1070 / 1689 loss=3.9, nll_loss=2.384, ppl=5.22, wps=554004, ups=1.12, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.272, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=8727 epoch 006: 1070 / 1689 loss=3.9, nll_loss=2.384, ppl=5.22, wps=554004, ups=1.12, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.272, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=8727 epoch 006: 1070 / 1689 loss=3.9, nll_loss=2.384, ppl=5.22, wps=554004, ups=1.12, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.272, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=8727 epoch 006: 1171 / 1689 loss=3.898, nll_loss=2.382, ppl=5.21, wps=546022, ups=1.1, wpb=494415, bsz=16739.4, num_updates=9600, lr=0.000645497, gnorm=0.272, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=8817 epoch 006: 1171 / 1689 loss=3.898, nll_loss=2.382, ppl=5.21, wps=546022, ups=1.1, wpb=494415, bsz=16739.4, num_updates=9600, lr=0.000645497, gnorm=0.272, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=8817 epoch 006: 1171 / 1689 loss=3.898, nll_loss=2.382, ppl=5.21, wps=546022, ups=1.1, wpb=494415, bsz=16739.4, num_updates=9600, lr=0.000645497, gnorm=0.272, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=8817 epoch 006: 1171 / 1689 loss=3.898, nll_loss=2.382, ppl=5.21, wps=546022, ups=1.1, wpb=494415, bsz=16739.4, num_updates=9600, lr=0.000645497, gnorm=0.272, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=8817 epoch 006: 1171 / 1689 loss=3.898, nll_loss=2.382, ppl=5.21, wps=546022, ups=1.1, wpb=494415, bsz=16739.4, num_updates=9600, lr=0.000645497, gnorm=0.272, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=8817 epoch 006: 1171 / 1689 loss=3.898, nll_loss=2.382, ppl=5.21, wps=546022, ups=1.1, wpb=494415, bsz=16739.4, num_updates=9600, lr=0.000645497, gnorm=0.272, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=8817 epoch 006: 1271 / 1689 loss=3.902, nll_loss=2.387, ppl=5.23, wps=553820, ups=1.12, wpb=495561, bsz=16173.8, num_updates=9700, lr=0.000642161, gnorm=0.275, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=8907 epoch 006: 1271 / 1689 loss=3.902, nll_loss=2.387, ppl=5.23, wps=553820, ups=1.12, wpb=495561, bsz=16173.8, num_updates=9700, lr=0.000642161, gnorm=0.275, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=8907 epoch 006: 1271 / 1689 loss=3.902, nll_loss=2.387, ppl=5.23, wps=553820, ups=1.12, wpb=495561, bsz=16173.8, num_updates=9700, lr=0.000642161, gnorm=0.275, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=8907 epoch 006: 1271 / 1689 loss=3.902, nll_loss=2.387, ppl=5.23, wps=553820, ups=1.12, wpb=495561, bsz=16173.8, num_updates=9700, lr=0.000642161, gnorm=0.275, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=8907 epoch 006: 1271 / 1689 loss=3.902, nll_loss=2.387, ppl=5.23, wps=553820, ups=1.12, wpb=495561, bsz=16173.8, num_updates=9700, lr=0.000642161, gnorm=0.275, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=8907 epoch 006: 1271 / 1689 loss=3.902, nll_loss=2.387, ppl=5.23, wps=553820, ups=1.12, wpb=495561, bsz=16173.8, num_updates=9700, lr=0.000642161, gnorm=0.275, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=8907 epoch 006: 1371 / 1689 loss=3.893, nll_loss=2.377, ppl=5.19, wps=555092, ups=1.12, wpb=497359, bsz=16719.1, num_updates=9800, lr=0.000638877, gnorm=0.273, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=8996 epoch 006: 1371 / 1689 loss=3.893, nll_loss=2.377, ppl=5.19, wps=555092, ups=1.12, wpb=497359, bsz=16719.1, num_updates=9800, lr=0.000638877, gnorm=0.273, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=8996 epoch 006: 1371 / 1689 loss=3.893, nll_loss=2.377, ppl=5.19, wps=555092, ups=1.12, wpb=497359, bsz=16719.1, num_updates=9800, lr=0.000638877, gnorm=0.273, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=8996 epoch 006: 1371 / 1689 loss=3.893, nll_loss=2.377, ppl=5.19, wps=555092, ups=1.12, wpb=497359, bsz=16719.1, num_updates=9800, lr=0.000638877, gnorm=0.273, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=8996 epoch 006: 1371 / 1689 loss=3.893, nll_loss=2.377, ppl=5.19, wps=555092, ups=1.12, wpb=497359, bsz=16719.1, num_updates=9800, lr=0.000638877, gnorm=0.273, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=8996 epoch 006: 1371 / 1689 loss=3.893, nll_loss=2.377, ppl=5.19, wps=555092, ups=1.12, wpb=497359, bsz=16719.1, num_updates=9800, lr=0.000638877, gnorm=0.273, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=8996 epoch 006: 1471 / 1689 loss=3.899, nll_loss=2.383, ppl=5.22, wps=557064, ups=1.13, wpb=494589, bsz=16329, num_updates=9900, lr=0.000635642, gnorm=0.264, clip=0, loss_scale=4, train_wall=87, gb_free=21.9, wall=9085 epoch 006: 1471 / 1689 loss=3.899, nll_loss=2.383, ppl=5.22, wps=557064, ups=1.13, wpb=494589, bsz=16329, num_updates=9900, lr=0.000635642, gnorm=0.264, clip=0, loss_scale=4, train_wall=87, gb_free=21.9, wall=9085 epoch 006: 1471 / 1689 loss=3.899, nll_loss=2.383, ppl=5.22, wps=557064, ups=1.13, wpb=494589, bsz=16329, num_updates=9900, lr=0.000635642, gnorm=0.264, clip=0, loss_scale=4, train_wall=87, gb_free=21.9, wall=9085 epoch 006: 1471 / 1689 loss=3.899, nll_loss=2.383, ppl=5.22, wps=557064, ups=1.13, wpb=494589, bsz=16329, num_updates=9900, lr=0.000635642, gnorm=0.264, clip=0, loss_scale=4, train_wall=87, gb_free=21.9, wall=9085 epoch 006: 1471 / 1689 loss=3.899, nll_loss=2.383, ppl=5.22, wps=557064, ups=1.13, wpb=494589, bsz=16329, num_updates=9900, lr=0.000635642, gnorm=0.264, clip=0, loss_scale=4, train_wall=87, gb_free=21.9, wall=9085 epoch 006: 1471 / 1689 loss=3.899, nll_loss=2.383, ppl=5.22, wps=557064, ups=1.13, wpb=494589, bsz=16329, num_updates=9900, lr=0.000635642, gnorm=0.264, clip=0, loss_scale=4, train_wall=87, gb_free=21.9, wall=9085 epoch 006: 1571 / 1689 loss=3.887, nll_loss=2.371, ppl=5.17, wps=553536, ups=1.12, wpb=496190, bsz=16776.5, num_updates=10000, lr=0.000632456, gnorm=0.262, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=9175 epoch 006: 1571 / 1689 loss=3.887, nll_loss=2.371, ppl=5.17, wps=553536, ups=1.12, wpb=496190, bsz=16776.5, num_updates=10000, lr=0.000632456, gnorm=0.262, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=9175 epoch 006: 1571 / 1689 loss=3.887, nll_loss=2.371, ppl=5.17, wps=553536, ups=1.12, wpb=496190, bsz=16776.5, num_updates=10000, lr=0.000632456, gnorm=0.262, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=9175 epoch 006: 1571 / 1689 loss=3.887, nll_loss=2.371, ppl=5.17, wps=553536, ups=1.12, wpb=496190, bsz=16776.5, num_updates=10000, lr=0.000632456, gnorm=0.262, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=9175 epoch 006: 1571 / 1689 loss=3.887, nll_loss=2.371, ppl=5.17, wps=553536, ups=1.12, wpb=496190, bsz=16776.5, num_updates=10000, lr=0.000632456, gnorm=0.262, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=9175 epoch 006: 1571 / 1689 loss=3.887, nll_loss=2.371, ppl=5.17, wps=553536, ups=1.12, wpb=496190, bsz=16776.5, num_updates=10000, lr=0.000632456, gnorm=0.262, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=9175 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 3.855 | nll_loss 2.281 | ppl 4.86 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.855 epoch 006 | valid on 'valid' subset | loss 3.855 | nll_loss 2.281 | ppl 4.86 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.855 epoch 006 | valid on 'valid' subset | loss 3.855 | nll_loss 2.281 | ppl 4.86 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.855 epoch 006 | valid on 'valid' subset | loss 3.855 | nll_loss 2.281 | ppl 4.86 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.855 epoch 006 | valid on 'valid' subset | loss 3.855 | nll_loss 2.281 | ppl 4.86 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.855 epoch 006 | valid on 'valid' subset | loss 3.855 | nll_loss 2.281 | ppl 4.86 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.855 epoch 006: 1672 / 1689 loss=3.889, nll_loss=2.373, ppl=5.18, wps=422530, ups=0.85, wpb=495672, bsz=16427.1, num_updates=10100, lr=0.000629317, gnorm=0.262, clip=0, loss_scale=4, train_wall=93, gb_free=21.9, wall=9292 epoch 006: 1672 / 1689 loss=3.889, nll_loss=2.373, ppl=5.18, wps=422530, ups=0.85, wpb=495672, bsz=16427.1, num_updates=10100, lr=0.000629317, gnorm=0.262, clip=0, loss_scale=4, train_wall=93, gb_free=21.9, wall=9292 epoch 006: 1672 / 1689 loss=3.889, nll_loss=2.373, ppl=5.18, wps=422530, ups=0.85, wpb=495672, bsz=16427.1, num_updates=10100, lr=0.000629317, gnorm=0.262, clip=0, loss_scale=4, train_wall=93, gb_free=21.9, wall=9292 epoch 006: 1672 / 1689 loss=3.889, nll_loss=2.373, ppl=5.18, wps=422530, ups=0.85, wpb=495672, bsz=16427.1, num_updates=10100, lr=0.000629317, gnorm=0.262, clip=0, loss_scale=4, train_wall=93, gb_free=21.9, wall=9292 epoch 006: 1672 / 1689 loss=3.889, nll_loss=2.373, ppl=5.18, wps=422530, ups=0.85, wpb=495672, bsz=16427.1, num_updates=10100, lr=0.000629317, gnorm=0.262, clip=0, loss_scale=4, train_wall=93, gb_free=21.9, wall=9292 epoch 006: 1672 / 1689 loss=3.889, nll_loss=2.373, ppl=5.18, wps=422530, ups=0.85, wpb=495672, bsz=16427.1, num_updates=10100, lr=0.000629317, gnorm=0.262, clip=0, loss_scale=4, train_wall=93, gb_free=21.9, wall=9292 end of epoch 6 (average epoch stats below) epoch 006 | loss 3.902 | nll_loss 2.386 | ppl 5.23 | wps 537122 | ups 1.08 | wpb 495121 | bsz 16503.4 | num_updates 10117 | lr 0.000628788 | gnorm 0.275 | clip 0 | loss_scale 4 | train_wall 1489 | gb_free 23 | wall 9307 epoch 006 | loss 3.902 | nll_loss 2.386 | ppl 5.23 | wps 537122 | ups 1.08 | wpb 495121 | bsz 16503.4 | num_updates 10117 | lr 0.000628788 | gnorm 0.275 | clip 0 | loss_scale 4 | train_wall 1489 | gb_free 23 | wall 9307 epoch 006 | loss 3.902 | nll_loss 2.386 | ppl 5.23 | wps 537122 | ups 1.08 | wpb 495121 | bsz 16503.4 | num_updates 10117 | lr 0.000628788 | gnorm 0.275 | clip 0 | loss_scale 4 | train_wall 1489 | gb_free 23 | wall 9307 epoch 006 | loss 3.902 | nll_loss 2.386 | ppl 5.23 | wps 537122 | ups 1.08 | wpb 495121 | bsz 16503.4 | num_updates 10117 | lr 0.000628788 | gnorm 0.275 | clip 0 | loss_scale 4 | train_wall 1489 | gb_free 23 | wall 9307 epoch 006 | loss 3.902 | nll_loss 2.386 | ppl 5.23 | wps 537122 | ups 1.08 | wpb 495121 | bsz 16503.4 | num_updates 10117 | lr 0.000628788 | gnorm 0.275 | clip 0 | loss_scale 4 | train_wall 1489 | gb_free 23 | wall 9307 epoch 006 | loss 3.902 | nll_loss 2.386 | ppl 5.23 | wps 537122 | ups 1.08 | wpb 495121 | bsz 16503.4 | num_updates 10117 | lr 0.000628788 | gnorm 0.275 | clip 0 | loss_scale 4 | train_wall 1489 | gb_free 23 | wall 9307 Start iterating over samples epoch 007: 83 / 1689 loss=3.871, nll_loss=2.352, ppl=5.11, wps=551845, ups=1.12, wpb=490636, bsz=16314.8, num_updates=10200, lr=0.000626224, gnorm=0.277, clip=0, loss_scale=4, train_wall=87, gb_free=22.4, wall=9381 epoch 007: 83 / 1689 loss=3.871, nll_loss=2.352, ppl=5.11, wps=551845, ups=1.12, wpb=490636, bsz=16314.8, num_updates=10200, lr=0.000626224, gnorm=0.277, clip=0, loss_scale=4, train_wall=87, gb_free=22.4, wall=9381 epoch 007: 83 / 1689 loss=3.871, nll_loss=2.352, ppl=5.11, wps=551845, ups=1.12, wpb=490636, bsz=16314.8, num_updates=10200, lr=0.000626224, gnorm=0.277, clip=0, loss_scale=4, train_wall=87, gb_free=22.4, wall=9381 epoch 007: 83 / 1689 loss=3.871, nll_loss=2.352, ppl=5.11, wps=551845, ups=1.12, wpb=490636, bsz=16314.8, num_updates=10200, lr=0.000626224, gnorm=0.277, clip=0, loss_scale=4, train_wall=87, gb_free=22.4, wall=9381 epoch 007: 83 / 1689 loss=3.871, nll_loss=2.352, ppl=5.11, wps=551845, ups=1.12, wpb=490636, bsz=16314.8, num_updates=10200, lr=0.000626224, gnorm=0.277, clip=0, loss_scale=4, train_wall=87, gb_free=22.4, wall=9381 epoch 007: 83 / 1689 loss=3.871, nll_loss=2.352, ppl=5.11, wps=551845, ups=1.12, wpb=490636, bsz=16314.8, num_updates=10200, lr=0.000626224, gnorm=0.277, clip=0, loss_scale=4, train_wall=87, gb_free=22.4, wall=9381 epoch 007: 83 / 1689 loss=3.871, nll_loss=2.352, ppl=5.11, wps=551845, ups=1.12, wpb=490636, bsz=16314.8, num_updates=10200, lr=0.000626224, gnorm=0.277, clip=0, loss_scale=4, train_wall=87, gb_free=22.4, wall=9381 epoch 007: 183 / 1689 loss=3.857, nll_loss=2.337, ppl=5.05, wps=553350, ups=1.12, wpb=495810, bsz=16695.2, num_updates=10300, lr=0.000623177, gnorm=0.257, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=9471 epoch 007: 183 / 1689 loss=3.857, nll_loss=2.337, ppl=5.05, wps=553350, ups=1.12, wpb=495810, bsz=16695.2, num_updates=10300, lr=0.000623177, gnorm=0.257, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=9471 epoch 007: 183 / 1689 loss=3.857, nll_loss=2.337, ppl=5.05, wps=553350, ups=1.12, wpb=495810, bsz=16695.2, num_updates=10300, lr=0.000623177, gnorm=0.257, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=9471 epoch 007: 183 / 1689 loss=3.857, nll_loss=2.337, ppl=5.05, wps=553350, ups=1.12, wpb=495810, bsz=16695.2, num_updates=10300, lr=0.000623177, gnorm=0.257, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=9471 epoch 007: 183 / 1689 loss=3.857, nll_loss=2.337, ppl=5.05, wps=553350, ups=1.12, wpb=495810, bsz=16695.2, num_updates=10300, lr=0.000623177, gnorm=0.257, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=9471 epoch 007: 183 / 1689 loss=3.857, nll_loss=2.337, ppl=5.05, wps=553350, ups=1.12, wpb=495810, bsz=16695.2, num_updates=10300, lr=0.000623177, gnorm=0.257, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=9471 epoch 007: 183 / 1689 loss=3.857, nll_loss=2.337, ppl=5.05, wps=553350, ups=1.12, wpb=495810, bsz=16695.2, num_updates=10300, lr=0.000623177, gnorm=0.257, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=9471 epoch 007: 283 / 1689 loss=3.865, nll_loss=2.345, ppl=5.08, wps=556781, ups=1.12, wpb=495289, bsz=16615.4, num_updates=10400, lr=0.000620174, gnorm=0.271, clip=0, loss_scale=4, train_wall=87, gb_free=22.8, wall=9560 epoch 007: 283 / 1689 loss=3.865, nll_loss=2.345, ppl=5.08, wps=556781, ups=1.12, wpb=495289, bsz=16615.4, num_updates=10400, lr=0.000620174, gnorm=0.271, clip=0, loss_scale=4, train_wall=87, gb_free=22.8, wall=9560 epoch 007: 283 / 1689 loss=3.865, nll_loss=2.345, ppl=5.08, wps=556781, ups=1.12, wpb=495289, bsz=16615.4, num_updates=10400, lr=0.000620174, gnorm=0.271, clip=0, loss_scale=4, train_wall=87, gb_free=22.8, wall=9560 epoch 007: 283 / 1689 loss=3.865, nll_loss=2.345, ppl=5.08, wps=556781, ups=1.12, wpb=495289, bsz=16615.4, num_updates=10400, lr=0.000620174, gnorm=0.271, clip=0, loss_scale=4, train_wall=87, gb_free=22.8, wall=9560 epoch 007: 283 / 1689 loss=3.865, nll_loss=2.345, ppl=5.08, wps=556781, ups=1.12, wpb=495289, bsz=16615.4, num_updates=10400, lr=0.000620174, gnorm=0.271, clip=0, loss_scale=4, train_wall=87, gb_free=22.8, wall=9560 epoch 007: 283 / 1689 loss=3.865, nll_loss=2.345, ppl=5.08, wps=556781, ups=1.12, wpb=495289, bsz=16615.4, num_updates=10400, lr=0.000620174, gnorm=0.271, clip=0, loss_scale=4, train_wall=87, gb_free=22.8, wall=9560 epoch 007: 283 / 1689 loss=3.865, nll_loss=2.345, ppl=5.08, wps=556781, ups=1.12, wpb=495289, bsz=16615.4, num_updates=10400, lr=0.000620174, gnorm=0.271, clip=0, loss_scale=4, train_wall=87, gb_free=22.8, wall=9560 epoch 007: 383 / 1689 loss=3.869, nll_loss=2.35, ppl=5.1, wps=557997, ups=1.13, wpb=495241, bsz=16108.6, num_updates=10500, lr=0.000617213, gnorm=0.271, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=9648 epoch 007: 383 / 1689 loss=3.869, nll_loss=2.35, ppl=5.1, wps=557997, ups=1.13, wpb=495241, bsz=16108.6, num_updates=10500, lr=0.000617213, gnorm=0.271, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=9648 epoch 007: 383 / 1689 loss=3.869, nll_loss=2.35, ppl=5.1, wps=557997, ups=1.13, wpb=495241, bsz=16108.6, num_updates=10500, lr=0.000617213, gnorm=0.271, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=9648 epoch 007: 383 / 1689 loss=3.869, nll_loss=2.35, ppl=5.1, wps=557997, ups=1.13, wpb=495241, bsz=16108.6, num_updates=10500, lr=0.000617213, gnorm=0.271, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=9648 epoch 007: 383 / 1689 loss=3.869, nll_loss=2.35, ppl=5.1, wps=557997, ups=1.13, wpb=495241, bsz=16108.6, num_updates=10500, lr=0.000617213, gnorm=0.271, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=9648 epoch 007: 383 / 1689 loss=3.869, nll_loss=2.35, ppl=5.1, wps=557997, ups=1.13, wpb=495241, bsz=16108.6, num_updates=10500, lr=0.000617213, gnorm=0.271, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=9648 epoch 007: 383 / 1689 loss=3.869, nll_loss=2.35, ppl=5.1, wps=557997, ups=1.13, wpb=495241, bsz=16108.6, num_updates=10500, lr=0.000617213, gnorm=0.271, clip=0, loss_scale=4, train_wall=87, gb_free=21.7, wall=9648 epoch 007: 484 / 1689 loss=3.871, nll_loss=2.352, ppl=5.11, wps=546208, ups=1.1, wpb=495457, bsz=16530.1, num_updates=10600, lr=0.000614295, gnorm=0.257, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=9739 epoch 007: 484 / 1689 loss=3.871, nll_loss=2.352, ppl=5.11, wps=546208, ups=1.1, wpb=495457, bsz=16530.1, num_updates=10600, lr=0.000614295, gnorm=0.257, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=9739 epoch 007: 484 / 1689 loss=3.871, nll_loss=2.352, ppl=5.11, wps=546208, ups=1.1, wpb=495457, bsz=16530.1, num_updates=10600, lr=0.000614295, gnorm=0.257, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=9739 epoch 007: 484 / 1689 loss=3.871, nll_loss=2.352, ppl=5.11, wps=546208, ups=1.1, wpb=495457, bsz=16530.1, num_updates=10600, lr=0.000614295, gnorm=0.257, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=9739 epoch 007: 484 / 1689 loss=3.871, nll_loss=2.352, ppl=5.11, wps=546208, ups=1.1, wpb=495457, bsz=16530.1, num_updates=10600, lr=0.000614295, gnorm=0.257, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=9739 epoch 007: 484 / 1689 loss=3.871, nll_loss=2.352, ppl=5.11, wps=546208, ups=1.1, wpb=495457, bsz=16530.1, num_updates=10600, lr=0.000614295, gnorm=0.257, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=9739 epoch 007: 484 / 1689 loss=3.871, nll_loss=2.352, ppl=5.11, wps=546208, ups=1.1, wpb=495457, bsz=16530.1, num_updates=10600, lr=0.000614295, gnorm=0.257, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=9739 epoch 007: 584 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=556314, ups=1.12, wpb=495890, bsz=16459, num_updates=10700, lr=0.000611418, gnorm=0.267, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=9828 epoch 007: 584 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=556314, ups=1.12, wpb=495890, bsz=16459, num_updates=10700, lr=0.000611418, gnorm=0.267, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=9828 epoch 007: 584 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=556314, ups=1.12, wpb=495890, bsz=16459, num_updates=10700, lr=0.000611418, gnorm=0.267, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=9828 epoch 007: 584 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=556314, ups=1.12, wpb=495890, bsz=16459, num_updates=10700, lr=0.000611418, gnorm=0.267, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=9828 epoch 007: 584 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=556314, ups=1.12, wpb=495890, bsz=16459, num_updates=10700, lr=0.000611418, gnorm=0.267, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=9828 epoch 007: 584 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=556314, ups=1.12, wpb=495890, bsz=16459, num_updates=10700, lr=0.000611418, gnorm=0.267, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=9828 epoch 007: 584 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=556314, ups=1.12, wpb=495890, bsz=16459, num_updates=10700, lr=0.000611418, gnorm=0.267, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=9828 epoch 007: 684 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=555599, ups=1.12, wpb=494674, bsz=16339.8, num_updates=10800, lr=0.000608581, gnorm=0.26, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=9917 epoch 007: 684 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=555599, ups=1.12, wpb=494674, bsz=16339.8, num_updates=10800, lr=0.000608581, gnorm=0.26, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=9917 epoch 007: 684 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=555599, ups=1.12, wpb=494674, bsz=16339.8, num_updates=10800, lr=0.000608581, gnorm=0.26, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=9917 epoch 007: 684 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=555599, ups=1.12, wpb=494674, bsz=16339.8, num_updates=10800, lr=0.000608581, gnorm=0.26, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=9917 epoch 007: 684 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=555599, ups=1.12, wpb=494674, bsz=16339.8, num_updates=10800, lr=0.000608581, gnorm=0.26, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=9917 epoch 007: 684 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=555599, ups=1.12, wpb=494674, bsz=16339.8, num_updates=10800, lr=0.000608581, gnorm=0.26, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=9917 epoch 007: 684 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=555599, ups=1.12, wpb=494674, bsz=16339.8, num_updates=10800, lr=0.000608581, gnorm=0.26, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=9917 epoch 007: 784 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=554413, ups=1.12, wpb=494281, bsz=16257.7, num_updates=10900, lr=0.000605783, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=10006 epoch 007: 784 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=554413, ups=1.12, wpb=494281, bsz=16257.7, num_updates=10900, lr=0.000605783, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=10006 epoch 007: 784 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=554413, ups=1.12, wpb=494281, bsz=16257.7, num_updates=10900, lr=0.000605783, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=10006 epoch 007: 784 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=554413, ups=1.12, wpb=494281, bsz=16257.7, num_updates=10900, lr=0.000605783, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=10006 epoch 007: 784 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=554413, ups=1.12, wpb=494281, bsz=16257.7, num_updates=10900, lr=0.000605783, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=10006 epoch 007: 784 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=554413, ups=1.12, wpb=494281, bsz=16257.7, num_updates=10900, lr=0.000605783, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=10006 epoch 007: 784 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=554413, ups=1.12, wpb=494281, bsz=16257.7, num_updates=10900, lr=0.000605783, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=10006 epoch 007: 884 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=558485, ups=1.13, wpb=494664, bsz=16437.2, num_updates=11000, lr=0.000603023, gnorm=0.268, clip=0, loss_scale=4, train_wall=87, gb_free=21.5, wall=10095 epoch 007: 884 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=558485, ups=1.13, wpb=494664, bsz=16437.2, num_updates=11000, lr=0.000603023, gnorm=0.268, clip=0, loss_scale=4, train_wall=87, gb_free=21.5, wall=10095 epoch 007: 884 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=558485, ups=1.13, wpb=494664, bsz=16437.2, num_updates=11000, lr=0.000603023, gnorm=0.268, clip=0, loss_scale=4, train_wall=87, gb_free=21.5, wall=10095 epoch 007: 884 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=558485, ups=1.13, wpb=494664, bsz=16437.2, num_updates=11000, lr=0.000603023, gnorm=0.268, clip=0, loss_scale=4, train_wall=87, gb_free=21.5, wall=10095 epoch 007: 884 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=558485, ups=1.13, wpb=494664, bsz=16437.2, num_updates=11000, lr=0.000603023, gnorm=0.268, clip=0, loss_scale=4, train_wall=87, gb_free=21.5, wall=10095 epoch 007: 884 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=558485, ups=1.13, wpb=494664, bsz=16437.2, num_updates=11000, lr=0.000603023, gnorm=0.268, clip=0, loss_scale=4, train_wall=87, gb_free=21.5, wall=10095 epoch 007: 884 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=558485, ups=1.13, wpb=494664, bsz=16437.2, num_updates=11000, lr=0.000603023, gnorm=0.268, clip=0, loss_scale=4, train_wall=87, gb_free=21.5, wall=10095 begin validation on "valid" subset epoch 007 | valid on 'valid' subset | loss 3.854 | nll_loss 2.286 | ppl 4.88 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.854 epoch 007 | valid on 'valid' subset | loss 3.854 | nll_loss 2.286 | ppl 4.88 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.854 epoch 007 | valid on 'valid' subset | loss 3.854 | nll_loss 2.286 | ppl 4.88 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.854 epoch 007 | valid on 'valid' subset | loss 3.854 | nll_loss 2.286 | ppl 4.88 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.854 epoch 007 | valid on 'valid' subset | loss 3.854 | nll_loss 2.286 | ppl 4.88 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.854 epoch 007 | valid on 'valid' subset | loss 3.854 | nll_loss 2.286 | ppl 4.88 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.854 epoch 007 | valid on 'valid' subset | loss 3.854 | nll_loss 2.286 | ppl 4.88 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.854 epoch 007: 984 / 1689 loss=3.861, nll_loss=2.342, ppl=5.07, wps=350272, ups=0.71, wpb=496759, bsz=16244.9, num_updates=11100, lr=0.0006003, gnorm=0.255, clip=0, loss_scale=8, train_wall=116, gb_free=22.2, wall=10237 epoch 007: 984 / 1689 loss=3.861, nll_loss=2.342, ppl=5.07, wps=350272, ups=0.71, wpb=496759, bsz=16244.9, num_updates=11100, lr=0.0006003, gnorm=0.255, clip=0, loss_scale=8, train_wall=116, gb_free=22.2, wall=10237 epoch 007: 984 / 1689 loss=3.861, nll_loss=2.342, ppl=5.07, wps=350272, ups=0.71, wpb=496759, bsz=16244.9, num_updates=11100, lr=0.0006003, gnorm=0.255, clip=0, loss_scale=8, train_wall=116, gb_free=22.2, wall=10237 epoch 007: 984 / 1689 loss=3.861, nll_loss=2.342, ppl=5.07, wps=350272, ups=0.71, wpb=496759, bsz=16244.9, num_updates=11100, lr=0.0006003, gnorm=0.255, clip=0, loss_scale=8, train_wall=116, gb_free=22.2, wall=10237 epoch 007: 984 / 1689 loss=3.861, nll_loss=2.342, ppl=5.07, wps=350272, ups=0.71, wpb=496759, bsz=16244.9, num_updates=11100, lr=0.0006003, gnorm=0.255, clip=0, loss_scale=8, train_wall=116, gb_free=22.2, wall=10237 epoch 007: 984 / 1689 loss=3.861, nll_loss=2.342, ppl=5.07, wps=350272, ups=0.71, wpb=496759, bsz=16244.9, num_updates=11100, lr=0.0006003, gnorm=0.255, clip=0, loss_scale=8, train_wall=116, gb_free=22.2, wall=10237 epoch 007: 984 / 1689 loss=3.861, nll_loss=2.342, ppl=5.07, wps=350272, ups=0.71, wpb=496759, bsz=16244.9, num_updates=11100, lr=0.0006003, gnorm=0.255, clip=0, loss_scale=8, train_wall=116, gb_free=22.2, wall=10237 epoch 007: 1085 / 1689 loss=3.862, nll_loss=2.343, ppl=5.07, wps=546135, ups=1.1, wpb=496149, bsz=16898.5, num_updates=11200, lr=0.000597614, gnorm=0.257, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=10328 epoch 007: 1085 / 1689 loss=3.862, nll_loss=2.343, ppl=5.07, wps=546135, ups=1.1, wpb=496149, bsz=16898.5, num_updates=11200, lr=0.000597614, gnorm=0.257, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=10328 epoch 007: 1085 / 1689 loss=3.862, nll_loss=2.343, ppl=5.07, wps=546135, ups=1.1, wpb=496149, bsz=16898.5, num_updates=11200, lr=0.000597614, gnorm=0.257, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=10328 epoch 007: 1085 / 1689 loss=3.862, nll_loss=2.343, ppl=5.07, wps=546135, ups=1.1, wpb=496149, bsz=16898.5, num_updates=11200, lr=0.000597614, gnorm=0.257, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=10328 epoch 007: 1085 / 1689 loss=3.862, nll_loss=2.343, ppl=5.07, wps=546135, ups=1.1, wpb=496149, bsz=16898.5, num_updates=11200, lr=0.000597614, gnorm=0.257, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=10328 epoch 007: 1085 / 1689 loss=3.862, nll_loss=2.343, ppl=5.07, wps=546135, ups=1.1, wpb=496149, bsz=16898.5, num_updates=11200, lr=0.000597614, gnorm=0.257, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=10328 epoch 007: 1085 / 1689 loss=3.862, nll_loss=2.343, ppl=5.07, wps=546135, ups=1.1, wpb=496149, bsz=16898.5, num_updates=11200, lr=0.000597614, gnorm=0.257, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=10328 epoch 007: 1186 / 1689 loss=3.86, nll_loss=2.341, ppl=5.07, wps=553914, ups=1.12, wpb=495926, bsz=16681, num_updates=11300, lr=0.000594964, gnorm=0.256, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=10417 epoch 007: 1186 / 1689 loss=3.86, nll_loss=2.341, ppl=5.07, wps=553914, ups=1.12, wpb=495926, bsz=16681, num_updates=11300, lr=0.000594964, gnorm=0.256, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=10417 epoch 007: 1186 / 1689 loss=3.86, nll_loss=2.341, ppl=5.07, wps=553914, ups=1.12, wpb=495926, bsz=16681, num_updates=11300, lr=0.000594964, gnorm=0.256, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=10417 epoch 007: 1186 / 1689 loss=3.86, nll_loss=2.341, ppl=5.07, wps=553914, ups=1.12, wpb=495926, bsz=16681, num_updates=11300, lr=0.000594964, gnorm=0.256, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=10417 epoch 007: 1186 / 1689 loss=3.86, nll_loss=2.341, ppl=5.07, wps=553914, ups=1.12, wpb=495926, bsz=16681, num_updates=11300, lr=0.000594964, gnorm=0.256, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=10417 epoch 007: 1186 / 1689 loss=3.86, nll_loss=2.341, ppl=5.07, wps=553914, ups=1.12, wpb=495926, bsz=16681, num_updates=11300, lr=0.000594964, gnorm=0.256, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=10417 epoch 007: 1186 / 1689 loss=3.86, nll_loss=2.341, ppl=5.07, wps=553914, ups=1.12, wpb=495926, bsz=16681, num_updates=11300, lr=0.000594964, gnorm=0.256, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=10417 epoch 007: 1286 / 1689 loss=3.862, nll_loss=2.343, ppl=5.07, wps=560045, ups=1.13, wpb=495319, bsz=16445.3, num_updates=11400, lr=0.000592349, gnorm=0.252, clip=0, loss_scale=2, train_wall=87, gb_free=20.8, wall=10506 epoch 007: 1286 / 1689 loss=3.862, nll_loss=2.343, ppl=5.07, wps=560045, ups=1.13, wpb=495319, bsz=16445.3, num_updates=11400, lr=0.000592349, gnorm=0.252, clip=0, loss_scale=2, train_wall=87, gb_free=20.8, wall=10506 epoch 007: 1286 / 1689 loss=3.862, nll_loss=2.343, ppl=5.07, wps=560045, ups=1.13, wpb=495319, bsz=16445.3, num_updates=11400, lr=0.000592349, gnorm=0.252, clip=0, loss_scale=2, train_wall=87, gb_free=20.8, wall=10506 epoch 007: 1286 / 1689 loss=3.862, nll_loss=2.343, ppl=5.07, wps=560045, ups=1.13, wpb=495319, bsz=16445.3, num_updates=11400, lr=0.000592349, gnorm=0.252, clip=0, loss_scale=2, train_wall=87, gb_free=20.8, wall=10506 epoch 007: 1286 / 1689 loss=3.862, nll_loss=2.343, ppl=5.07, wps=560045, ups=1.13, wpb=495319, bsz=16445.3, num_updates=11400, lr=0.000592349, gnorm=0.252, clip=0, loss_scale=2, train_wall=87, gb_free=20.8, wall=10506 epoch 007: 1286 / 1689 loss=3.862, nll_loss=2.343, ppl=5.07, wps=560045, ups=1.13, wpb=495319, bsz=16445.3, num_updates=11400, lr=0.000592349, gnorm=0.252, clip=0, loss_scale=2, train_wall=87, gb_free=20.8, wall=10506 epoch 007: 1286 / 1689 loss=3.862, nll_loss=2.343, ppl=5.07, wps=560045, ups=1.13, wpb=495319, bsz=16445.3, num_updates=11400, lr=0.000592349, gnorm=0.252, clip=0, loss_scale=2, train_wall=87, gb_free=20.8, wall=10506 epoch 007: 1386 / 1689 loss=3.866, nll_loss=2.348, ppl=5.09, wps=558957, ups=1.13, wpb=494917, bsz=16423.4, num_updates=11500, lr=0.000589768, gnorm=0.268, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=10594 epoch 007: 1386 / 1689 loss=3.866, nll_loss=2.348, ppl=5.09, wps=558957, ups=1.13, wpb=494917, bsz=16423.4, num_updates=11500, lr=0.000589768, gnorm=0.268, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=10594 epoch 007: 1386 / 1689 loss=3.866, nll_loss=2.348, ppl=5.09, wps=558957, ups=1.13, wpb=494917, bsz=16423.4, num_updates=11500, lr=0.000589768, gnorm=0.268, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=10594 epoch 007: 1386 / 1689 loss=3.866, nll_loss=2.348, ppl=5.09, wps=558957, ups=1.13, wpb=494917, bsz=16423.4, num_updates=11500, lr=0.000589768, gnorm=0.268, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=10594 epoch 007: 1386 / 1689 loss=3.866, nll_loss=2.348, ppl=5.09, wps=558957, ups=1.13, wpb=494917, bsz=16423.4, num_updates=11500, lr=0.000589768, gnorm=0.268, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=10594 epoch 007: 1386 / 1689 loss=3.866, nll_loss=2.348, ppl=5.09, wps=558957, ups=1.13, wpb=494917, bsz=16423.4, num_updates=11500, lr=0.000589768, gnorm=0.268, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=10594 epoch 007: 1386 / 1689 loss=3.866, nll_loss=2.348, ppl=5.09, wps=558957, ups=1.13, wpb=494917, bsz=16423.4, num_updates=11500, lr=0.000589768, gnorm=0.268, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=10594 epoch 007: 1486 / 1689 loss=3.85, nll_loss=2.33, ppl=5.03, wps=559657, ups=1.13, wpb=496164, bsz=16664.4, num_updates=11600, lr=0.00058722, gnorm=0.241, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=10683 epoch 007: 1486 / 1689 loss=3.85, nll_loss=2.33, ppl=5.03, wps=559657, ups=1.13, wpb=496164, bsz=16664.4, num_updates=11600, lr=0.00058722, gnorm=0.241, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=10683 epoch 007: 1486 / 1689 loss=3.85, nll_loss=2.33, ppl=5.03, wps=559657, ups=1.13, wpb=496164, bsz=16664.4, num_updates=11600, lr=0.00058722, gnorm=0.241, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=10683 epoch 007: 1486 / 1689 loss=3.85, nll_loss=2.33, ppl=5.03, wps=559657, ups=1.13, wpb=496164, bsz=16664.4, num_updates=11600, lr=0.00058722, gnorm=0.241, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=10683 epoch 007: 1486 / 1689 loss=3.85, nll_loss=2.33, ppl=5.03, wps=559657, ups=1.13, wpb=496164, bsz=16664.4, num_updates=11600, lr=0.00058722, gnorm=0.241, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=10683 epoch 007: 1486 / 1689 loss=3.85, nll_loss=2.33, ppl=5.03, wps=559657, ups=1.13, wpb=496164, bsz=16664.4, num_updates=11600, lr=0.00058722, gnorm=0.241, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=10683 epoch 007: 1486 / 1689 loss=3.85, nll_loss=2.33, ppl=5.03, wps=559657, ups=1.13, wpb=496164, bsz=16664.4, num_updates=11600, lr=0.00058722, gnorm=0.241, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=10683 epoch 007: 1586 / 1689 loss=3.848, nll_loss=2.328, ppl=5.02, wps=554769, ups=1.12, wpb=495862, bsz=17005.7, num_updates=11700, lr=0.000584705, gnorm=0.244, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=10772 epoch 007: 1586 / 1689 loss=3.848, nll_loss=2.328, ppl=5.02, wps=554769, ups=1.12, wpb=495862, bsz=17005.7, num_updates=11700, lr=0.000584705, gnorm=0.244, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=10772 epoch 007: 1586 / 1689 loss=3.848, nll_loss=2.328, ppl=5.02, wps=554769, ups=1.12, wpb=495862, bsz=17005.7, num_updates=11700, lr=0.000584705, gnorm=0.244, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=10772 epoch 007: 1586 / 1689 loss=3.848, nll_loss=2.328, ppl=5.02, wps=554769, ups=1.12, wpb=495862, bsz=17005.7, num_updates=11700, lr=0.000584705, gnorm=0.244, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=10772 epoch 007: 1586 / 1689 loss=3.848, nll_loss=2.328, ppl=5.02, wps=554769, ups=1.12, wpb=495862, bsz=17005.7, num_updates=11700, lr=0.000584705, gnorm=0.244, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=10772 epoch 007: 1586 / 1689 loss=3.848, nll_loss=2.328, ppl=5.02, wps=554769, ups=1.12, wpb=495862, bsz=17005.7, num_updates=11700, lr=0.000584705, gnorm=0.244, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=10772 epoch 007: 1586 / 1689 loss=3.848, nll_loss=2.328, ppl=5.02, wps=554769, ups=1.12, wpb=495862, bsz=17005.7, num_updates=11700, lr=0.000584705, gnorm=0.244, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=10772 epoch 007: 1687 / 1689 loss=3.85, nll_loss=2.33, ppl=5.03, wps=550253, ups=1.11, wpb=494492, bsz=16483.2, num_updates=11800, lr=0.000582223, gnorm=0.26, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=10862 epoch 007: 1687 / 1689 loss=3.85, nll_loss=2.33, ppl=5.03, wps=550253, ups=1.11, wpb=494492, bsz=16483.2, num_updates=11800, lr=0.000582223, gnorm=0.26, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=10862 epoch 007: 1687 / 1689 loss=3.85, nll_loss=2.33, ppl=5.03, wps=550253, ups=1.11, wpb=494492, bsz=16483.2, num_updates=11800, lr=0.000582223, gnorm=0.26, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=10862 epoch 007: 1687 / 1689 loss=3.85, nll_loss=2.33, ppl=5.03, wps=550253, ups=1.11, wpb=494492, bsz=16483.2, num_updates=11800, lr=0.000582223, gnorm=0.26, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=10862 epoch 007: 1687 / 1689 loss=3.85, nll_loss=2.33, ppl=5.03, wps=550253, ups=1.11, wpb=494492, bsz=16483.2, num_updates=11800, lr=0.000582223, gnorm=0.26, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=10862 epoch 007: 1687 / 1689 loss=3.85, nll_loss=2.33, ppl=5.03, wps=550253, ups=1.11, wpb=494492, bsz=16483.2, num_updates=11800, lr=0.000582223, gnorm=0.26, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=10862 epoch 007: 1687 / 1689 loss=3.85, nll_loss=2.33, ppl=5.03, wps=550253, ups=1.11, wpb=494492, bsz=16483.2, num_updates=11800, lr=0.000582223, gnorm=0.26, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=10862 end of epoch 7 (average epoch stats below) epoch 007 | loss 3.862 | nll_loss 2.342 | ppl 5.07 | wps 535938 | ups 1.08 | wpb 495110 | bsz 16505.5 | num_updates 11802 | lr 0.000582173 | gnorm 0.259 | clip 0 | loss_scale 1 | train_wall 1506 | gb_free 23 | wall 10863 epoch 007 | loss 3.862 | nll_loss 2.342 | ppl 5.07 | wps 535938 | ups 1.08 | wpb 495110 | bsz 16505.5 | num_updates 11802 | lr 0.000582173 | gnorm 0.259 | clip 0 | loss_scale 1 | train_wall 1506 | gb_free 23 | wall 10863 epoch 007 | loss 3.862 | nll_loss 2.342 | ppl 5.07 | wps 535938 | ups 1.08 | wpb 495110 | bsz 16505.5 | num_updates 11802 | lr 0.000582173 | gnorm 0.259 | clip 0 | loss_scale 1 | train_wall 1506 | gb_free 23 | wall 10863 epoch 007 | loss 3.862 | nll_loss 2.342 | ppl 5.07 | wps 535938 | ups 1.08 | wpb 495110 | bsz 16505.5 | num_updates 11802 | lr 0.000582173 | gnorm 0.259 | clip 0 | loss_scale 1 | train_wall 1506 | gb_free 23 | wall 10863 epoch 007 | loss 3.862 | nll_loss 2.342 | ppl 5.07 | wps 535938 | ups 1.08 | wpb 495110 | bsz 16505.5 | num_updates 11802 | lr 0.000582173 | gnorm 0.259 | clip 0 | loss_scale 1 | train_wall 1506 | gb_free 23 | wall 10863 epoch 007 | loss 3.862 | nll_loss 2.342 | ppl 5.07 | wps 535938 | ups 1.08 | wpb 495110 | bsz 16505.5 | num_updates 11802 | lr 0.000582173 | gnorm 0.259 | clip 0 | loss_scale 1 | train_wall 1506 | gb_free 23 | wall 10863 epoch 007 | loss 3.862 | nll_loss 2.342 | ppl 5.07 | wps 535938 | ups 1.08 | wpb 495110 | bsz 16505.5 | num_updates 11802 | lr 0.000582173 | gnorm 0.259 | clip 0 | loss_scale 1 | train_wall 1506 | gb_free 23 | wall 10863 Start iterating over samples epoch 008: 98 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=396075, ups=0.81, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.262, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=10986 epoch 008: 98 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=396075, ups=0.81, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.262, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=10986 epoch 008: 98 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=396075, ups=0.81, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.262, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=10986 epoch 008: 98 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=396075, ups=0.81, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.262, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=10986 epoch 008: 98 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=396075, ups=0.81, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.262, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=10986 epoch 008: 98 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=396075, ups=0.81, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.262, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=10986 epoch 008: 98 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=396075, ups=0.81, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.262, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=10986 epoch 008: 98 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=396075, ups=0.81, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.262, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=10986 epoch 008: 198 / 1689 loss=3.826, nll_loss=2.303, ppl=4.94, wps=559224, ups=1.13, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.261, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=11075 epoch 008: 198 / 1689 loss=3.826, nll_loss=2.303, ppl=4.94, wps=559224, ups=1.13, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.261, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=11075 epoch 008: 198 / 1689 loss=3.826, nll_loss=2.303, ppl=4.94, wps=559224, ups=1.13, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.261, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=11075 epoch 008: 198 / 1689 loss=3.826, nll_loss=2.303, ppl=4.94, wps=559224, ups=1.13, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.261, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=11075 epoch 008: 198 / 1689 loss=3.826, nll_loss=2.303, ppl=4.94, wps=559224, ups=1.13, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.261, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=11075 epoch 008: 198 / 1689 loss=3.826, nll_loss=2.303, ppl=4.94, wps=559224, ups=1.13, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.261, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=11075 epoch 008: 198 / 1689 loss=3.826, nll_loss=2.303, ppl=4.94, wps=559224, ups=1.13, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.261, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=11075 epoch 008: 198 / 1689 loss=3.826, nll_loss=2.303, ppl=4.94, wps=559224, ups=1.13, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.261, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=11075 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 3.835 | nll_loss 2.264 | ppl 4.8 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.835 epoch 008 | valid on 'valid' subset | loss 3.835 | nll_loss 2.264 | ppl 4.8 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.835 epoch 008 | valid on 'valid' subset | loss 3.835 | nll_loss 2.264 | ppl 4.8 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.835 epoch 008 | valid on 'valid' subset | loss 3.835 | nll_loss 2.264 | ppl 4.8 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.835 epoch 008 | valid on 'valid' subset | loss 3.835 | nll_loss 2.264 | ppl 4.8 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.835 epoch 008 | valid on 'valid' subset | loss 3.835 | nll_loss 2.264 | ppl 4.8 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.835 epoch 008 | valid on 'valid' subset | loss 3.835 | nll_loss 2.264 | ppl 4.8 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.835 epoch 008 | valid on 'valid' subset | loss 3.835 | nll_loss 2.264 | ppl 4.8 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.835 epoch 008: 298 / 1689 loss=3.833, nll_loss=2.31, ppl=4.96, wps=461594, ups=0.93, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.253, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=11182 epoch 008: 298 / 1689 loss=3.833, nll_loss=2.31, ppl=4.96, wps=461594, ups=0.93, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.253, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=11182 epoch 008: 298 / 1689 loss=3.833, nll_loss=2.31, ppl=4.96, wps=461594, ups=0.93, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.253, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=11182 epoch 008: 298 / 1689 loss=3.833, nll_loss=2.31, ppl=4.96, wps=461594, ups=0.93, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.253, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=11182 epoch 008: 298 / 1689 loss=3.833, nll_loss=2.31, ppl=4.96, wps=461594, ups=0.93, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.253, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=11182 epoch 008: 298 / 1689 loss=3.833, nll_loss=2.31, ppl=4.96, wps=461594, ups=0.93, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.253, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=11182 epoch 008: 298 / 1689 loss=3.833, nll_loss=2.31, ppl=4.96, wps=461594, ups=0.93, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.253, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=11182 epoch 008: 298 / 1689 loss=3.833, nll_loss=2.31, ppl=4.96, wps=461594, ups=0.93, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.253, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=11182 epoch 008: 398 / 1689 loss=3.826, nll_loss=2.303, ppl=4.93, wps=559886, ups=1.13, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.247, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=11270 epoch 008: 398 / 1689 loss=3.826, nll_loss=2.303, ppl=4.93, wps=559886, ups=1.13, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.247, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=11270 epoch 008: 398 / 1689 loss=3.826, nll_loss=2.303, ppl=4.93, wps=559886, ups=1.13, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.247, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=11270 epoch 008: 398 / 1689 loss=3.826, nll_loss=2.303, ppl=4.93, wps=559886, ups=1.13, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.247, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=11270 epoch 008: 398 / 1689 loss=3.826, nll_loss=2.303, ppl=4.93, wps=559886, ups=1.13, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.247, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=11270 epoch 008: 398 / 1689 loss=3.826, nll_loss=2.303, ppl=4.93, wps=559886, ups=1.13, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.247, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=11270 epoch 008: 398 / 1689 loss=3.826, nll_loss=2.303, ppl=4.93, wps=559886, ups=1.13, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.247, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=11270 epoch 008: 398 / 1689 loss=3.826, nll_loss=2.303, ppl=4.93, wps=559886, ups=1.13, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.247, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=11270 epoch 008: 498 / 1689 loss=3.838, nll_loss=2.317, ppl=4.98, wps=561408, ups=1.14, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.253, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=11358 epoch 008: 498 / 1689 loss=3.838, nll_loss=2.317, ppl=4.98, wps=561408, ups=1.14, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.253, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=11358 epoch 008: 498 / 1689 loss=3.838, nll_loss=2.317, ppl=4.98, wps=561408, ups=1.14, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.253, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=11358 epoch 008: 498 / 1689 loss=3.838, nll_loss=2.317, ppl=4.98, wps=561408, ups=1.14, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.253, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=11358 epoch 008: 498 / 1689 loss=3.838, nll_loss=2.317, ppl=4.98, wps=561408, ups=1.14, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.253, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=11358 epoch 008: 498 / 1689 loss=3.838, nll_loss=2.317, ppl=4.98, wps=561408, ups=1.14, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.253, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=11358 epoch 008: 498 / 1689 loss=3.838, nll_loss=2.317, ppl=4.98, wps=561408, ups=1.14, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.253, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=11358 epoch 008: 498 / 1689 loss=3.838, nll_loss=2.317, ppl=4.98, wps=561408, ups=1.14, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.253, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=11358 epoch 008: 598 / 1689 loss=3.841, nll_loss=2.32, ppl=4.99, wps=554599, ups=1.12, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.238, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=11448 epoch 008: 598 / 1689 loss=3.841, nll_loss=2.32, ppl=4.99, wps=554599, ups=1.12, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.238, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=11448 epoch 008: 598 / 1689 loss=3.841, nll_loss=2.32, ppl=4.99, wps=554599, ups=1.12, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.238, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=11448 epoch 008: 598 / 1689 loss=3.841, nll_loss=2.32, ppl=4.99, wps=554599, ups=1.12, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.238, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=11448 epoch 008: 598 / 1689 loss=3.841, nll_loss=2.32, ppl=4.99, wps=554599, ups=1.12, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.238, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=11448 epoch 008: 598 / 1689 loss=3.841, nll_loss=2.32, ppl=4.99, wps=554599, ups=1.12, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.238, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=11448 epoch 008: 598 / 1689 loss=3.841, nll_loss=2.32, ppl=4.99, wps=554599, ups=1.12, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.238, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=11448 epoch 008: 598 / 1689 loss=3.841, nll_loss=2.32, ppl=4.99, wps=554599, ups=1.12, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.238, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=11448 epoch 008: 698 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=551953, ups=1.11, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.265, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=11537 epoch 008: 698 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=551953, ups=1.11, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.265, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=11537 epoch 008: 698 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=551953, ups=1.11, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.265, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=11537 epoch 008: 698 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=551953, ups=1.11, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.265, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=11537 epoch 008: 698 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=551953, ups=1.11, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.265, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=11537 epoch 008: 698 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=551953, ups=1.11, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.265, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=11537 epoch 008: 698 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=551953, ups=1.11, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.265, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=11537 epoch 008: 698 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=551953, ups=1.11, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.265, clip=0, loss_scale=2, train_wall=88, gb_free=20.7, wall=11537 epoch 008: 798 / 1689 loss=3.835, nll_loss=2.313, ppl=4.97, wps=550260, ups=1.11, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.246, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=11627 epoch 008: 798 / 1689 loss=3.835, nll_loss=2.313, ppl=4.97, wps=550260, ups=1.11, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.246, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=11627 epoch 008: 798 / 1689 loss=3.835, nll_loss=2.313, ppl=4.97, wps=550260, ups=1.11, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.246, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=11627 epoch 008: 798 / 1689 loss=3.835, nll_loss=2.313, ppl=4.97, wps=550260, ups=1.11, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.246, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=11627 epoch 008: 798 / 1689 loss=3.835, nll_loss=2.313, ppl=4.97, wps=550260, ups=1.11, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.246, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=11627 epoch 008: 798 / 1689 loss=3.835, nll_loss=2.313, ppl=4.97, wps=550260, ups=1.11, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.246, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=11627 epoch 008: 798 / 1689 loss=3.835, nll_loss=2.313, ppl=4.97, wps=550260, ups=1.11, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.246, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=11627 epoch 008: 798 / 1689 loss=3.835, nll_loss=2.313, ppl=4.97, wps=550260, ups=1.11, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.246, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=11627 epoch 008: 898 / 1689 loss=3.834, nll_loss=2.313, ppl=4.97, wps=557524, ups=1.12, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.251, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=11716 epoch 008: 898 / 1689 loss=3.834, nll_loss=2.313, ppl=4.97, wps=557524, ups=1.12, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.251, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=11716 epoch 008: 898 / 1689 loss=3.834, nll_loss=2.313, ppl=4.97, wps=557524, ups=1.12, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.251, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=11716 epoch 008: 898 / 1689 loss=3.834, nll_loss=2.313, ppl=4.97, wps=557524, ups=1.12, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.251, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=11716 epoch 008: 898 / 1689 loss=3.834, nll_loss=2.313, ppl=4.97, wps=557524, ups=1.12, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.251, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=11716 epoch 008: 898 / 1689 loss=3.834, nll_loss=2.313, ppl=4.97, wps=557524, ups=1.12, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.251, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=11716 epoch 008: 898 / 1689 loss=3.834, nll_loss=2.313, ppl=4.97, wps=557524, ups=1.12, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.251, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=11716 epoch 008: 898 / 1689 loss=3.834, nll_loss=2.313, ppl=4.97, wps=557524, ups=1.12, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.251, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=11716 epoch 008: 998 / 1689 loss=3.836, nll_loss=2.315, ppl=4.98, wps=551874, ups=1.12, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=11806 epoch 008: 998 / 1689 loss=3.836, nll_loss=2.315, ppl=4.98, wps=551874, ups=1.12, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=11806 epoch 008: 998 / 1689 loss=3.836, nll_loss=2.315, ppl=4.98, wps=551874, ups=1.12, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=11806 epoch 008: 998 / 1689 loss=3.836, nll_loss=2.315, ppl=4.98, wps=551874, ups=1.12, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=11806 epoch 008: 998 / 1689 loss=3.836, nll_loss=2.315, ppl=4.98, wps=551874, ups=1.12, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=11806 epoch 008: 998 / 1689 loss=3.836, nll_loss=2.315, ppl=4.98, wps=551874, ups=1.12, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=11806 epoch 008: 998 / 1689 loss=3.836, nll_loss=2.315, ppl=4.98, wps=551874, ups=1.12, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=11806 epoch 008: 998 / 1689 loss=3.836, nll_loss=2.315, ppl=4.98, wps=551874, ups=1.12, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.25, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=11806 epoch 008: 1098 / 1689 loss=3.825, nll_loss=2.302, ppl=4.93, wps=552530, ups=1.12, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=11896 epoch 008: 1098 / 1689 loss=3.825, nll_loss=2.302, ppl=4.93, wps=552530, ups=1.12, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=11896 epoch 008: 1098 / 1689 loss=3.825, nll_loss=2.302, ppl=4.93, wps=552530, ups=1.12, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=11896 epoch 008: 1098 / 1689 loss=3.825, nll_loss=2.302, ppl=4.93, wps=552530, ups=1.12, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=11896 epoch 008: 1098 / 1689 loss=3.825, nll_loss=2.302, ppl=4.93, wps=552530, ups=1.12, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=11896 epoch 008: 1098 / 1689 loss=3.825, nll_loss=2.302, ppl=4.93, wps=552530, ups=1.12, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=11896 epoch 008: 1098 / 1689 loss=3.825, nll_loss=2.302, ppl=4.93, wps=552530, ups=1.12, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=11896 epoch 008: 1098 / 1689 loss=3.825, nll_loss=2.302, ppl=4.93, wps=552530, ups=1.12, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=11896 epoch 008: 1198 / 1689 loss=3.825, nll_loss=2.303, ppl=4.94, wps=556283, ups=1.12, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.244, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=11985 epoch 008: 1198 / 1689 loss=3.825, nll_loss=2.303, ppl=4.94, wps=556283, ups=1.12, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.244, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=11985 epoch 008: 1198 / 1689 loss=3.825, nll_loss=2.303, ppl=4.94, wps=556283, ups=1.12, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.244, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=11985 epoch 008: 1198 / 1689 loss=3.825, nll_loss=2.303, ppl=4.94, wps=556283, ups=1.12, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.244, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=11985 epoch 008: 1198 / 1689 loss=3.825, nll_loss=2.303, ppl=4.94, wps=556283, ups=1.12, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.244, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=11985 epoch 008: 1198 / 1689 loss=3.825, nll_loss=2.303, ppl=4.94, wps=556283, ups=1.12, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.244, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=11985 epoch 008: 1198 / 1689 loss=3.825, nll_loss=2.303, ppl=4.94, wps=556283, ups=1.12, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.244, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=11985 epoch 008: 1198 / 1689 loss=3.825, nll_loss=2.303, ppl=4.94, wps=556283, ups=1.12, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.244, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=11985 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 3.818 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.818 epoch 008 | valid on 'valid' subset | loss 3.818 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.818 epoch 008 | valid on 'valid' subset | loss 3.818 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.818 epoch 008 | valid on 'valid' subset | loss 3.818 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.818 epoch 008 | valid on 'valid' subset | loss 3.818 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.818 epoch 008 | valid on 'valid' subset | loss 3.818 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.818 epoch 008 | valid on 'valid' subset | loss 3.818 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.818 epoch 008 | valid on 'valid' subset | loss 3.818 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.818 epoch 008: 1298 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=363632, ups=0.73, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.238, clip=0, loss_scale=4, train_wall=86, gb_free=21.9, wall=12121 epoch 008: 1298 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=363632, ups=0.73, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.238, clip=0, loss_scale=4, train_wall=86, gb_free=21.9, wall=12121 epoch 008: 1298 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=363632, ups=0.73, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.238, clip=0, loss_scale=4, train_wall=86, gb_free=21.9, wall=12121 epoch 008: 1298 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=363632, ups=0.73, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.238, clip=0, loss_scale=4, train_wall=86, gb_free=21.9, wall=12121 epoch 008: 1298 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=363632, ups=0.73, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.238, clip=0, loss_scale=4, train_wall=86, gb_free=21.9, wall=12121 epoch 008: 1298 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=363632, ups=0.73, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.238, clip=0, loss_scale=4, train_wall=86, gb_free=21.9, wall=12121 epoch 008: 1298 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=363632, ups=0.73, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.238, clip=0, loss_scale=4, train_wall=86, gb_free=21.9, wall=12121 epoch 008: 1298 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=363632, ups=0.73, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.238, clip=0, loss_scale=4, train_wall=86, gb_free=21.9, wall=12121 epoch 008: 1398 / 1689 loss=3.827, nll_loss=2.306, ppl=4.94, wps=570036, ups=1.15, wpb=496428, bsz=16568.3, num_updates=13200, lr=0.000550482, gnorm=0.248, clip=0, loss_scale=4, train_wall=87, gb_free=21.9, wall=12208 epoch 008: 1398 / 1689 loss=3.827, nll_loss=2.306, ppl=4.94, wps=570036, ups=1.15, wpb=496428, bsz=16568.3, num_updates=13200, lr=0.000550482, gnorm=0.248, clip=0, loss_scale=4, train_wall=87, gb_free=21.9, wall=12208 epoch 008: 1398 / 1689 loss=3.827, nll_loss=2.306, ppl=4.94, wps=570036, ups=1.15, wpb=496428, bsz=16568.3, num_updates=13200, lr=0.000550482, gnorm=0.248, clip=0, loss_scale=4, train_wall=87, gb_free=21.9, wall=12208 epoch 008: 1398 / 1689 loss=3.827, nll_loss=2.306, ppl=4.94, wps=570036, ups=1.15, wpb=496428, bsz=16568.3, num_updates=13200, lr=0.000550482, gnorm=0.248, clip=0, loss_scale=4, train_wall=87, gb_free=21.9, wall=12208 epoch 008: 1398 / 1689 loss=3.827, nll_loss=2.306, ppl=4.94, wps=570036, ups=1.15, wpb=496428, bsz=16568.3, num_updates=13200, lr=0.000550482, gnorm=0.248, clip=0, loss_scale=4, train_wall=87, gb_free=21.9, wall=12208 epoch 008: 1398 / 1689 loss=3.827, nll_loss=2.306, ppl=4.94, wps=570036, ups=1.15, wpb=496428, bsz=16568.3, num_updates=13200, lr=0.000550482, gnorm=0.248, clip=0, loss_scale=4, train_wall=87, gb_free=21.9, wall=12208 epoch 008: 1398 / 1689 loss=3.827, nll_loss=2.306, ppl=4.94, wps=570036, ups=1.15, wpb=496428, bsz=16568.3, num_updates=13200, lr=0.000550482, gnorm=0.248, clip=0, loss_scale=4, train_wall=87, gb_free=21.9, wall=12208 epoch 008: 1398 / 1689 loss=3.827, nll_loss=2.306, ppl=4.94, wps=570036, ups=1.15, wpb=496428, bsz=16568.3, num_updates=13200, lr=0.000550482, gnorm=0.248, clip=0, loss_scale=4, train_wall=87, gb_free=21.9, wall=12208 epoch 008: 1499 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=554369, ups=1.12, wpb=495192, bsz=16502.5, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=12298 epoch 008: 1499 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=554369, ups=1.12, wpb=495192, bsz=16502.5, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=12298 epoch 008: 1499 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=554369, ups=1.12, wpb=495192, bsz=16502.5, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=12298 epoch 008: 1499 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=554369, ups=1.12, wpb=495192, bsz=16502.5, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=12298 epoch 008: 1499 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=554369, ups=1.12, wpb=495192, bsz=16502.5, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=12298 epoch 008: 1499 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=554369, ups=1.12, wpb=495192, bsz=16502.5, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=12298 epoch 008: 1499 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=554369, ups=1.12, wpb=495192, bsz=16502.5, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=12298 epoch 008: 1499 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=554369, ups=1.12, wpb=495192, bsz=16502.5, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=12298 epoch 008: 1599 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=557784, ups=1.13, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=12386 epoch 008: 1599 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=557784, ups=1.13, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=12386 epoch 008: 1599 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=557784, ups=1.13, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=12386 epoch 008: 1599 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=557784, ups=1.13, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=12386 epoch 008: 1599 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=557784, ups=1.13, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=12386 epoch 008: 1599 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=557784, ups=1.13, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=12386 epoch 008: 1599 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=557784, ups=1.13, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=12386 epoch 008: 1599 / 1689 loss=3.831, nll_loss=2.31, ppl=4.96, wps=557784, ups=1.13, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=12386 end of epoch 8 (average epoch stats below) epoch 008 | loss 3.83 | nll_loss 2.309 | ppl 4.95 | wps 521434 | ups 1.05 | wpb 495117 | bsz 16509 | num_updates 13490 | lr 0.000544533 | gnorm 0.249 | clip 0 | loss_scale 4 | train_wall 1481 | gb_free 22.7 | wall 12466 epoch 008 | loss 3.83 | nll_loss 2.309 | ppl 4.95 | wps 521434 | ups 1.05 | wpb 495117 | bsz 16509 | num_updates 13490 | lr 0.000544533 | gnorm 0.249 | clip 0 | loss_scale 4 | train_wall 1481 | gb_free 22.7 | wall 12466 epoch 008 | loss 3.83 | nll_loss 2.309 | ppl 4.95 | wps 521434 | ups 1.05 | wpb 495117 | bsz 16509 | num_updates 13490 | lr 0.000544533 | gnorm 0.249 | clip 0 | loss_scale 4 | train_wall 1481 | gb_free 22.7 | wall 12466 epoch 008 | loss 3.83 | nll_loss 2.309 | ppl 4.95 | wps 521434 | ups 1.05 | wpb 495117 | bsz 16509 | num_updates 13490 | lr 0.000544533 | gnorm 0.249 | clip 0 | loss_scale 4 | train_wall 1481 | gb_free 22.7 | wall 12466 epoch 008 | loss 3.83 | nll_loss 2.309 | ppl 4.95 | wps 521434 | ups 1.05 | wpb 495117 | bsz 16509 | num_updates 13490 | lr 0.000544533 | gnorm 0.249 | clip 0 | loss_scale 4 | train_wall 1481 | gb_free 22.7 | wall 12466 epoch 008 | loss 3.83 | nll_loss 2.309 | ppl 4.95 | wps 521434 | ups 1.05 | wpb 495117 | bsz 16509 | num_updates 13490 | lr 0.000544533 | gnorm 0.249 | clip 0 | loss_scale 4 | train_wall 1481 | gb_free 22.7 | wall 12466 epoch 008 | loss 3.83 | nll_loss 2.309 | ppl 4.95 | wps 521434 | ups 1.05 | wpb 495117 | bsz 16509 | num_updates 13490 | lr 0.000544533 | gnorm 0.249 | clip 0 | loss_scale 4 | train_wall 1481 | gb_free 22.7 | wall 12466 epoch 008 | loss 3.83 | nll_loss 2.309 | ppl 4.95 | wps 521434 | ups 1.05 | wpb 495117 | bsz 16509 | num_updates 13490 | lr 0.000544533 | gnorm 0.249 | clip 0 | loss_scale 4 | train_wall 1481 | gb_free 22.7 | wall 12466 Start iterating over samples epoch 009: 10 / 1689 loss=3.828, nll_loss=2.306, ppl=4.95, wps=542472, ups=1.1, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.253, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12477 epoch 009: 10 / 1689 loss=3.828, nll_loss=2.306, ppl=4.95, wps=542472, ups=1.1, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.253, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12477 epoch 009: 10 / 1689 loss=3.828, nll_loss=2.306, ppl=4.95, wps=542472, ups=1.1, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.253, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12477 epoch 009: 10 / 1689 loss=3.828, nll_loss=2.306, ppl=4.95, wps=542472, ups=1.1, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.253, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12477 epoch 009: 10 / 1689 loss=3.828, nll_loss=2.306, ppl=4.95, wps=542472, ups=1.1, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.253, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12477 epoch 009: 10 / 1689 loss=3.828, nll_loss=2.306, ppl=4.95, wps=542472, ups=1.1, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.253, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12477 epoch 009: 10 / 1689 loss=3.828, nll_loss=2.306, ppl=4.95, wps=542472, ups=1.1, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.253, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12477 epoch 009: 10 / 1689 loss=3.828, nll_loss=2.306, ppl=4.95, wps=542472, ups=1.1, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.253, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12477 epoch 009: 10 / 1689 loss=3.828, nll_loss=2.306, ppl=4.95, wps=542472, ups=1.1, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.253, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12477 epoch 009: 110 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=558350, ups=1.13, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.24, clip=0, loss_scale=4, train_wall=87, gb_free=22.5, wall=12566 epoch 009: 110 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=558350, ups=1.13, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.24, clip=0, loss_scale=4, train_wall=87, gb_free=22.5, wall=12566 epoch 009: 110 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=558350, ups=1.13, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.24, clip=0, loss_scale=4, train_wall=87, gb_free=22.5, wall=12566 epoch 009: 110 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=558350, ups=1.13, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.24, clip=0, loss_scale=4, train_wall=87, gb_free=22.5, wall=12566 epoch 009: 110 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=558350, ups=1.13, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.24, clip=0, loss_scale=4, train_wall=87, gb_free=22.5, wall=12566 epoch 009: 110 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=558350, ups=1.13, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.24, clip=0, loss_scale=4, train_wall=87, gb_free=22.5, wall=12566 epoch 009: 110 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=558350, ups=1.13, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.24, clip=0, loss_scale=4, train_wall=87, gb_free=22.5, wall=12566 epoch 009: 110 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=558350, ups=1.13, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.24, clip=0, loss_scale=4, train_wall=87, gb_free=22.5, wall=12566 epoch 009: 110 / 1689 loss=3.801, nll_loss=2.276, ppl=4.84, wps=558350, ups=1.13, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.24, clip=0, loss_scale=4, train_wall=87, gb_free=22.5, wall=12566 epoch 009: 210 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=557073, ups=1.13, wpb=493508, bsz=16709.4, num_updates=13700, lr=0.000540343, gnorm=0.238, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=12654 epoch 009: 210 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=557073, ups=1.13, wpb=493508, bsz=16709.4, num_updates=13700, lr=0.000540343, gnorm=0.238, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=12654 epoch 009: 210 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=557073, ups=1.13, wpb=493508, bsz=16709.4, num_updates=13700, lr=0.000540343, gnorm=0.238, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=12654 epoch 009: 210 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=557073, ups=1.13, wpb=493508, bsz=16709.4, num_updates=13700, lr=0.000540343, gnorm=0.238, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=12654 epoch 009: 210 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=557073, ups=1.13, wpb=493508, bsz=16709.4, num_updates=13700, lr=0.000540343, gnorm=0.238, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=12654 epoch 009: 210 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=557073, ups=1.13, wpb=493508, bsz=16709.4, num_updates=13700, lr=0.000540343, gnorm=0.238, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=12654 epoch 009: 210 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=557073, ups=1.13, wpb=493508, bsz=16709.4, num_updates=13700, lr=0.000540343, gnorm=0.238, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=12654 epoch 009: 210 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=557073, ups=1.13, wpb=493508, bsz=16709.4, num_updates=13700, lr=0.000540343, gnorm=0.238, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=12654 epoch 009: 210 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=557073, ups=1.13, wpb=493508, bsz=16709.4, num_updates=13700, lr=0.000540343, gnorm=0.238, clip=0, loss_scale=4, train_wall=87, gb_free=21.6, wall=12654 epoch 009: 310 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=555713, ups=1.12, wpb=497118, bsz=16555.1, num_updates=13800, lr=0.000538382, gnorm=0.229, clip=0, loss_scale=8, train_wall=88, gb_free=21.5, wall=12744 epoch 009: 310 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=555713, ups=1.12, wpb=497118, bsz=16555.1, num_updates=13800, lr=0.000538382, gnorm=0.229, clip=0, loss_scale=8, train_wall=88, gb_free=21.5, wall=12744 epoch 009: 310 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=555713, ups=1.12, wpb=497118, bsz=16555.1, num_updates=13800, lr=0.000538382, gnorm=0.229, clip=0, loss_scale=8, train_wall=88, gb_free=21.5, wall=12744 epoch 009: 310 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=555713, ups=1.12, wpb=497118, bsz=16555.1, num_updates=13800, lr=0.000538382, gnorm=0.229, clip=0, loss_scale=8, train_wall=88, gb_free=21.5, wall=12744 epoch 009: 310 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=555713, ups=1.12, wpb=497118, bsz=16555.1, num_updates=13800, lr=0.000538382, gnorm=0.229, clip=0, loss_scale=8, train_wall=88, gb_free=21.5, wall=12744 epoch 009: 310 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=555713, ups=1.12, wpb=497118, bsz=16555.1, num_updates=13800, lr=0.000538382, gnorm=0.229, clip=0, loss_scale=8, train_wall=88, gb_free=21.5, wall=12744 epoch 009: 310 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=555713, ups=1.12, wpb=497118, bsz=16555.1, num_updates=13800, lr=0.000538382, gnorm=0.229, clip=0, loss_scale=8, train_wall=88, gb_free=21.5, wall=12744 epoch 009: 310 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=555713, ups=1.12, wpb=497118, bsz=16555.1, num_updates=13800, lr=0.000538382, gnorm=0.229, clip=0, loss_scale=8, train_wall=88, gb_free=21.5, wall=12744 epoch 009: 310 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=555713, ups=1.12, wpb=497118, bsz=16555.1, num_updates=13800, lr=0.000538382, gnorm=0.229, clip=0, loss_scale=8, train_wall=88, gb_free=21.5, wall=12744 epoch 009: 411 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=553638, ups=1.12, wpb=495176, bsz=16151, num_updates=13900, lr=0.000536442, gnorm=0.239, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=12833 epoch 009: 411 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=553638, ups=1.12, wpb=495176, bsz=16151, num_updates=13900, lr=0.000536442, gnorm=0.239, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=12833 epoch 009: 411 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=553638, ups=1.12, wpb=495176, bsz=16151, num_updates=13900, lr=0.000536442, gnorm=0.239, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=12833 epoch 009: 411 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=553638, ups=1.12, wpb=495176, bsz=16151, num_updates=13900, lr=0.000536442, gnorm=0.239, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=12833 epoch 009: 411 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=553638, ups=1.12, wpb=495176, bsz=16151, num_updates=13900, lr=0.000536442, gnorm=0.239, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=12833 epoch 009: 411 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=553638, ups=1.12, wpb=495176, bsz=16151, num_updates=13900, lr=0.000536442, gnorm=0.239, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=12833 epoch 009: 411 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=553638, ups=1.12, wpb=495176, bsz=16151, num_updates=13900, lr=0.000536442, gnorm=0.239, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=12833 epoch 009: 411 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=553638, ups=1.12, wpb=495176, bsz=16151, num_updates=13900, lr=0.000536442, gnorm=0.239, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=12833 epoch 009: 411 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=553638, ups=1.12, wpb=495176, bsz=16151, num_updates=13900, lr=0.000536442, gnorm=0.239, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=12833 epoch 009: 511 / 1689 loss=3.804, nll_loss=2.279, ppl=4.86, wps=556208, ups=1.12, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.234, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=12922 epoch 009: 511 / 1689 loss=3.804, nll_loss=2.279, ppl=4.86, wps=556208, ups=1.12, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.234, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=12922 epoch 009: 511 / 1689 loss=3.804, nll_loss=2.279, ppl=4.86, wps=556208, ups=1.12, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.234, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=12922 epoch 009: 511 / 1689 loss=3.804, nll_loss=2.279, ppl=4.86, wps=556208, ups=1.12, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.234, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=12922 epoch 009: 511 / 1689 loss=3.804, nll_loss=2.279, ppl=4.86, wps=556208, ups=1.12, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.234, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=12922 epoch 009: 511 / 1689 loss=3.804, nll_loss=2.279, ppl=4.86, wps=556208, ups=1.12, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.234, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=12922 epoch 009: 511 / 1689 loss=3.804, nll_loss=2.279, ppl=4.86, wps=556208, ups=1.12, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.234, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=12922 epoch 009: 511 / 1689 loss=3.804, nll_loss=2.279, ppl=4.86, wps=556208, ups=1.12, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.234, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=12922 epoch 009: 511 / 1689 loss=3.804, nll_loss=2.279, ppl=4.86, wps=556208, ups=1.12, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.234, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=12922 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 3.818 | nll_loss 2.251 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.818 epoch 009 | valid on 'valid' subset | loss 3.818 | nll_loss 2.251 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.818 epoch 009 | valid on 'valid' subset | loss 3.818 | nll_loss 2.251 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.818 epoch 009 | valid on 'valid' subset | loss 3.818 | nll_loss 2.251 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.818 epoch 009 | valid on 'valid' subset | loss 3.818 | nll_loss 2.251 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.818 epoch 009 | valid on 'valid' subset | loss 3.818 | nll_loss 2.251 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.818 epoch 009 | valid on 'valid' subset | loss 3.818 | nll_loss 2.251 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.818 epoch 009 | valid on 'valid' subset | loss 3.818 | nll_loss 2.251 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.818 epoch 009 | valid on 'valid' subset | loss 3.818 | nll_loss 2.251 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.818 epoch 009: 611 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=461005, ups=0.93, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.243, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=13030 epoch 009: 611 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=461005, ups=0.93, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.243, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=13030 epoch 009: 611 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=461005, ups=0.93, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.243, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=13030 epoch 009: 611 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=461005, ups=0.93, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.243, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=13030 epoch 009: 611 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=461005, ups=0.93, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.243, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=13030 epoch 009: 611 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=461005, ups=0.93, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.243, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=13030 epoch 009: 611 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=461005, ups=0.93, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.243, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=13030 epoch 009: 611 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=461005, ups=0.93, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.243, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=13030 epoch 009: 611 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=461005, ups=0.93, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.243, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=13030 epoch 009: 712 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=557668, ups=1.13, wpb=495619, bsz=16474, num_updates=14200, lr=0.000530745, gnorm=0.237, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=13119 epoch 009: 712 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=557668, ups=1.13, wpb=495619, bsz=16474, num_updates=14200, lr=0.000530745, gnorm=0.237, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=13119 epoch 009: 712 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=557668, ups=1.13, wpb=495619, bsz=16474, num_updates=14200, lr=0.000530745, gnorm=0.237, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=13119 epoch 009: 712 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=557668, ups=1.13, wpb=495619, bsz=16474, num_updates=14200, lr=0.000530745, gnorm=0.237, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=13119 epoch 009: 712 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=557668, ups=1.13, wpb=495619, bsz=16474, num_updates=14200, lr=0.000530745, gnorm=0.237, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=13119 epoch 009: 712 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=557668, ups=1.13, wpb=495619, bsz=16474, num_updates=14200, lr=0.000530745, gnorm=0.237, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=13119 epoch 009: 712 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=557668, ups=1.13, wpb=495619, bsz=16474, num_updates=14200, lr=0.000530745, gnorm=0.237, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=13119 epoch 009: 712 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=557668, ups=1.13, wpb=495619, bsz=16474, num_updates=14200, lr=0.000530745, gnorm=0.237, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=13119 epoch 009: 712 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=557668, ups=1.13, wpb=495619, bsz=16474, num_updates=14200, lr=0.000530745, gnorm=0.237, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=13119 epoch 009: 812 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=556105, ups=1.12, wpb=494789, bsz=16667.7, num_updates=14300, lr=0.000528886, gnorm=0.239, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=13208 epoch 009: 812 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=556105, ups=1.12, wpb=494789, bsz=16667.7, num_updates=14300, lr=0.000528886, gnorm=0.239, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=13208 epoch 009: 812 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=556105, ups=1.12, wpb=494789, bsz=16667.7, num_updates=14300, lr=0.000528886, gnorm=0.239, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=13208 epoch 009: 812 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=556105, ups=1.12, wpb=494789, bsz=16667.7, num_updates=14300, lr=0.000528886, gnorm=0.239, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=13208 epoch 009: 812 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=556105, ups=1.12, wpb=494789, bsz=16667.7, num_updates=14300, lr=0.000528886, gnorm=0.239, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=13208 epoch 009: 812 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=556105, ups=1.12, wpb=494789, bsz=16667.7, num_updates=14300, lr=0.000528886, gnorm=0.239, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=13208 epoch 009: 812 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=556105, ups=1.12, wpb=494789, bsz=16667.7, num_updates=14300, lr=0.000528886, gnorm=0.239, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=13208 epoch 009: 812 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=556105, ups=1.12, wpb=494789, bsz=16667.7, num_updates=14300, lr=0.000528886, gnorm=0.239, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=13208 epoch 009: 812 / 1689 loss=3.808, nll_loss=2.284, ppl=4.87, wps=556105, ups=1.12, wpb=494789, bsz=16667.7, num_updates=14300, lr=0.000528886, gnorm=0.239, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=13208 epoch 009: 912 / 1689 loss=3.811, nll_loss=2.288, ppl=4.89, wps=551061, ups=1.11, wpb=495399, bsz=16653.6, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=13298 epoch 009: 912 / 1689 loss=3.811, nll_loss=2.288, ppl=4.89, wps=551061, ups=1.11, wpb=495399, bsz=16653.6, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=13298 epoch 009: 912 / 1689 loss=3.811, nll_loss=2.288, ppl=4.89, wps=551061, ups=1.11, wpb=495399, bsz=16653.6, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=13298 epoch 009: 912 / 1689 loss=3.811, nll_loss=2.288, ppl=4.89, wps=551061, ups=1.11, wpb=495399, bsz=16653.6, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=13298 epoch 009: 912 / 1689 loss=3.811, nll_loss=2.288, ppl=4.89, wps=551061, ups=1.11, wpb=495399, bsz=16653.6, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=13298 epoch 009: 912 / 1689 loss=3.811, nll_loss=2.288, ppl=4.89, wps=551061, ups=1.11, wpb=495399, bsz=16653.6, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=13298 epoch 009: 912 / 1689 loss=3.811, nll_loss=2.288, ppl=4.89, wps=551061, ups=1.11, wpb=495399, bsz=16653.6, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=13298 epoch 009: 912 / 1689 loss=3.811, nll_loss=2.288, ppl=4.89, wps=551061, ups=1.11, wpb=495399, bsz=16653.6, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=13298 epoch 009: 912 / 1689 loss=3.811, nll_loss=2.288, ppl=4.89, wps=551061, ups=1.11, wpb=495399, bsz=16653.6, num_updates=14400, lr=0.000527046, gnorm=0.241, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=13298 epoch 009: 1012 / 1689 loss=3.807, nll_loss=2.283, ppl=4.87, wps=554596, ups=1.12, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.23, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=13387 epoch 009: 1012 / 1689 loss=3.807, nll_loss=2.283, ppl=4.87, wps=554596, ups=1.12, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.23, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=13387 epoch 009: 1012 / 1689 loss=3.807, nll_loss=2.283, ppl=4.87, wps=554596, ups=1.12, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.23, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=13387 epoch 009: 1012 / 1689 loss=3.807, nll_loss=2.283, ppl=4.87, wps=554596, ups=1.12, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.23, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=13387 epoch 009: 1012 / 1689 loss=3.807, nll_loss=2.283, ppl=4.87, wps=554596, ups=1.12, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.23, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=13387 epoch 009: 1012 / 1689 loss=3.807, nll_loss=2.283, ppl=4.87, wps=554596, ups=1.12, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.23, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=13387 epoch 009: 1012 / 1689 loss=3.807, nll_loss=2.283, ppl=4.87, wps=554596, ups=1.12, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.23, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=13387 epoch 009: 1012 / 1689 loss=3.807, nll_loss=2.283, ppl=4.87, wps=554596, ups=1.12, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.23, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=13387 epoch 009: 1012 / 1689 loss=3.807, nll_loss=2.283, ppl=4.87, wps=554596, ups=1.12, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.23, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=13387 epoch 009: 1112 / 1689 loss=3.811, nll_loss=2.288, ppl=4.88, wps=550585, ups=1.11, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13477 epoch 009: 1112 / 1689 loss=3.811, nll_loss=2.288, ppl=4.88, wps=550585, ups=1.11, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13477 epoch 009: 1112 / 1689 loss=3.811, nll_loss=2.288, ppl=4.88, wps=550585, ups=1.11, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13477 epoch 009: 1112 / 1689 loss=3.811, nll_loss=2.288, ppl=4.88, wps=550585, ups=1.11, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13477 epoch 009: 1112 / 1689 loss=3.811, nll_loss=2.288, ppl=4.88, wps=550585, ups=1.11, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13477 epoch 009: 1112 / 1689 loss=3.811, nll_loss=2.288, ppl=4.88, wps=550585, ups=1.11, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13477 epoch 009: 1112 / 1689 loss=3.811, nll_loss=2.288, ppl=4.88, wps=550585, ups=1.11, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13477 epoch 009: 1112 / 1689 loss=3.811, nll_loss=2.288, ppl=4.88, wps=550585, ups=1.11, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13477 epoch 009: 1112 / 1689 loss=3.811, nll_loss=2.288, ppl=4.88, wps=550585, ups=1.11, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=13477 epoch 009: 1212 / 1689 loss=3.804, nll_loss=2.28, ppl=4.86, wps=551773, ups=1.11, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=13566 epoch 009: 1212 / 1689 loss=3.804, nll_loss=2.28, ppl=4.86, wps=551773, ups=1.11, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=13566 epoch 009: 1212 / 1689 loss=3.804, nll_loss=2.28, ppl=4.86, wps=551773, ups=1.11, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=13566 epoch 009: 1212 / 1689 loss=3.804, nll_loss=2.28, ppl=4.86, wps=551773, ups=1.11, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=13566 epoch 009: 1212 / 1689 loss=3.804, nll_loss=2.28, ppl=4.86, wps=551773, ups=1.11, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=13566 epoch 009: 1212 / 1689 loss=3.804, nll_loss=2.28, ppl=4.86, wps=551773, ups=1.11, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=13566 epoch 009: 1212 / 1689 loss=3.804, nll_loss=2.28, ppl=4.86, wps=551773, ups=1.11, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=13566 epoch 009: 1212 / 1689 loss=3.804, nll_loss=2.28, ppl=4.86, wps=551773, ups=1.11, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=13566 epoch 009: 1212 / 1689 loss=3.804, nll_loss=2.28, ppl=4.86, wps=551773, ups=1.11, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.237, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=13566 epoch 009: 1312 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=555484, ups=1.12, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.235, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13656 epoch 009: 1312 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=555484, ups=1.12, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.235, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13656 epoch 009: 1312 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=555484, ups=1.12, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.235, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13656 epoch 009: 1312 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=555484, ups=1.12, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.235, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13656 epoch 009: 1312 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=555484, ups=1.12, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.235, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13656 epoch 009: 1312 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=555484, ups=1.12, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.235, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13656 epoch 009: 1312 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=555484, ups=1.12, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.235, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13656 epoch 009: 1312 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=555484, ups=1.12, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.235, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13656 epoch 009: 1312 / 1689 loss=3.805, nll_loss=2.281, ppl=4.86, wps=555484, ups=1.12, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.235, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13656 epoch 009: 1412 / 1689 loss=3.801, nll_loss=2.277, ppl=4.85, wps=555721, ups=1.12, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=13745 epoch 009: 1412 / 1689 loss=3.801, nll_loss=2.277, ppl=4.85, wps=555721, ups=1.12, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=13745 epoch 009: 1412 / 1689 loss=3.801, nll_loss=2.277, ppl=4.85, wps=555721, ups=1.12, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=13745 epoch 009: 1412 / 1689 loss=3.801, nll_loss=2.277, ppl=4.85, wps=555721, ups=1.12, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=13745 epoch 009: 1412 / 1689 loss=3.801, nll_loss=2.277, ppl=4.85, wps=555721, ups=1.12, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=13745 epoch 009: 1412 / 1689 loss=3.801, nll_loss=2.277, ppl=4.85, wps=555721, ups=1.12, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=13745 epoch 009: 1412 / 1689 loss=3.801, nll_loss=2.277, ppl=4.85, wps=555721, ups=1.12, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=13745 epoch 009: 1412 / 1689 loss=3.801, nll_loss=2.277, ppl=4.85, wps=555721, ups=1.12, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=13745 epoch 009: 1412 / 1689 loss=3.801, nll_loss=2.277, ppl=4.85, wps=555721, ups=1.12, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22.2, wall=13745 epoch 009: 1512 / 1689 loss=3.81, nll_loss=2.287, ppl=4.88, wps=555976, ups=1.12, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13834 epoch 009: 1512 / 1689 loss=3.81, nll_loss=2.287, ppl=4.88, wps=555976, ups=1.12, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13834 epoch 009: 1512 / 1689 loss=3.81, nll_loss=2.287, ppl=4.88, wps=555976, ups=1.12, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13834 epoch 009: 1512 / 1689 loss=3.81, nll_loss=2.287, ppl=4.88, wps=555976, ups=1.12, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13834 epoch 009: 1512 / 1689 loss=3.81, nll_loss=2.287, ppl=4.88, wps=555976, ups=1.12, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13834 epoch 009: 1512 / 1689 loss=3.81, nll_loss=2.287, ppl=4.88, wps=555976, ups=1.12, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13834 epoch 009: 1512 / 1689 loss=3.81, nll_loss=2.287, ppl=4.88, wps=555976, ups=1.12, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13834 epoch 009: 1512 / 1689 loss=3.81, nll_loss=2.287, ppl=4.88, wps=555976, ups=1.12, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13834 epoch 009: 1512 / 1689 loss=3.81, nll_loss=2.287, ppl=4.88, wps=555976, ups=1.12, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13834 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 3.8 | nll_loss 2.233 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.8 epoch 009 | valid on 'valid' subset | loss 3.8 | nll_loss 2.233 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.8 epoch 009 | valid on 'valid' subset | loss 3.8 | nll_loss 2.233 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.8 epoch 009 | valid on 'valid' subset | loss 3.8 | nll_loss 2.233 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.8 epoch 009 | valid on 'valid' subset | loss 3.8 | nll_loss 2.233 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.8 epoch 009 | valid on 'valid' subset | loss 3.8 | nll_loss 2.233 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.8 epoch 009 | valid on 'valid' subset | loss 3.8 | nll_loss 2.233 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.8 epoch 009 | valid on 'valid' subset | loss 3.8 | nll_loss 2.233 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.8 epoch 009 | valid on 'valid' subset | loss 3.8 | nll_loss 2.233 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.8 epoch 009: 1612 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=454575, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.243, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13943 epoch 009: 1612 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=454575, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.243, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13943 epoch 009: 1612 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=454575, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.243, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13943 epoch 009: 1612 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=454575, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.243, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13943 epoch 009: 1612 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=454575, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.243, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13943 epoch 009: 1612 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=454575, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.243, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13943 epoch 009: 1612 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=454575, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.243, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13943 epoch 009: 1612 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=454575, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.243, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13943 epoch 009: 1612 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=454575, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.243, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=13943 end of epoch 9 (average epoch stats below) epoch 009 | loss 3.805 | nll_loss 2.281 | ppl 4.86 | wps 540309 | ups 1.09 | wpb 495132 | bsz 16499.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.237 | clip 0 | loss_scale 4 | train_wall 1478 | gb_free 25.4 | wall 14011 epoch 009 | loss 3.805 | nll_loss 2.281 | ppl 4.86 | wps 540309 | ups 1.09 | wpb 495132 | bsz 16499.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.237 | clip 0 | loss_scale 4 | train_wall 1478 | gb_free 25.4 | wall 14011 epoch 009 | loss 3.805 | nll_loss 2.281 | ppl 4.86 | wps 540309 | ups 1.09 | wpb 495132 | bsz 16499.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.237 | clip 0 | loss_scale 4 | train_wall 1478 | gb_free 25.4 | wall 14011 epoch 009 | loss 3.805 | nll_loss 2.281 | ppl 4.86 | wps 540309 | ups 1.09 | wpb 495132 | bsz 16499.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.237 | clip 0 | loss_scale 4 | train_wall 1478 | gb_free 25.4 | wall 14011 epoch 009 | loss 3.805 | nll_loss 2.281 | ppl 4.86 | wps 540309 | ups 1.09 | wpb 495132 | bsz 16499.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.237 | clip 0 | loss_scale 4 | train_wall 1478 | gb_free 25.4 | wall 14011 epoch 009 | loss 3.805 | nll_loss 2.281 | ppl 4.86 | wps 540309 | ups 1.09 | wpb 495132 | bsz 16499.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.237 | clip 0 | loss_scale 4 | train_wall 1478 | gb_free 25.4 | wall 14011 epoch 009 | loss 3.805 | nll_loss 2.281 | ppl 4.86 | wps 540309 | ups 1.09 | wpb 495132 | bsz 16499.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.237 | clip 0 | loss_scale 4 | train_wall 1478 | gb_free 25.4 | wall 14011 epoch 009 | loss 3.805 | nll_loss 2.281 | ppl 4.86 | wps 540309 | ups 1.09 | wpb 495132 | bsz 16499.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.237 | clip 0 | loss_scale 4 | train_wall 1478 | gb_free 25.4 | wall 14011 epoch 009 | loss 3.805 | nll_loss 2.281 | ppl 4.86 | wps 540309 | ups 1.09 | wpb 495132 | bsz 16499.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.237 | clip 0 | loss_scale 4 | train_wall 1478 | gb_free 25.4 | wall 14011 Start iterating over samples epoch 010: 24 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=545981, ups=1.11, wpb=491710, bsz=16559.6, num_updates=15200, lr=0.000512989, gnorm=0.228, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=14033 epoch 010: 24 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=545981, ups=1.11, wpb=491710, bsz=16559.6, num_updates=15200, lr=0.000512989, gnorm=0.228, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=14033 epoch 010: 24 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=545981, ups=1.11, wpb=491710, bsz=16559.6, num_updates=15200, lr=0.000512989, gnorm=0.228, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=14033 epoch 010: 24 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=545981, ups=1.11, wpb=491710, bsz=16559.6, num_updates=15200, lr=0.000512989, gnorm=0.228, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=14033 epoch 010: 24 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=545981, ups=1.11, wpb=491710, bsz=16559.6, num_updates=15200, lr=0.000512989, gnorm=0.228, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=14033 epoch 010: 24 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=545981, ups=1.11, wpb=491710, bsz=16559.6, num_updates=15200, lr=0.000512989, gnorm=0.228, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=14033 epoch 010: 24 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=545981, ups=1.11, wpb=491710, bsz=16559.6, num_updates=15200, lr=0.000512989, gnorm=0.228, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=14033 epoch 010: 24 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=545981, ups=1.11, wpb=491710, bsz=16559.6, num_updates=15200, lr=0.000512989, gnorm=0.228, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=14033 epoch 010: 24 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=545981, ups=1.11, wpb=491710, bsz=16559.6, num_updates=15200, lr=0.000512989, gnorm=0.228, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=14033 epoch 010: 24 / 1689 loss=3.803, nll_loss=2.279, ppl=4.85, wps=545981, ups=1.11, wpb=491710, bsz=16559.6, num_updates=15200, lr=0.000512989, gnorm=0.228, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=14033 epoch 010: 125 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=542819, ups=1.1, wpb=495390, bsz=16394.8, num_updates=15300, lr=0.00051131, gnorm=0.235, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=14124 epoch 010: 125 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=542819, ups=1.1, wpb=495390, bsz=16394.8, num_updates=15300, lr=0.00051131, gnorm=0.235, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=14124 epoch 010: 125 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=542819, ups=1.1, wpb=495390, bsz=16394.8, num_updates=15300, lr=0.00051131, gnorm=0.235, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=14124 epoch 010: 125 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=542819, ups=1.1, wpb=495390, bsz=16394.8, num_updates=15300, lr=0.00051131, gnorm=0.235, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=14124 epoch 010: 125 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=542819, ups=1.1, wpb=495390, bsz=16394.8, num_updates=15300, lr=0.00051131, gnorm=0.235, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=14124 epoch 010: 125 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=542819, ups=1.1, wpb=495390, bsz=16394.8, num_updates=15300, lr=0.00051131, gnorm=0.235, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=14124 epoch 010: 125 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=542819, ups=1.1, wpb=495390, bsz=16394.8, num_updates=15300, lr=0.00051131, gnorm=0.235, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=14124 epoch 010: 125 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=542819, ups=1.1, wpb=495390, bsz=16394.8, num_updates=15300, lr=0.00051131, gnorm=0.235, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=14124 epoch 010: 125 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=542819, ups=1.1, wpb=495390, bsz=16394.8, num_updates=15300, lr=0.00051131, gnorm=0.235, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=14124 epoch 010: 125 / 1689 loss=3.771, nll_loss=2.243, ppl=4.73, wps=542819, ups=1.1, wpb=495390, bsz=16394.8, num_updates=15300, lr=0.00051131, gnorm=0.235, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=14124 epoch 010: 225 / 1689 loss=3.785, nll_loss=2.258, ppl=4.78, wps=551728, ups=1.11, wpb=495444, bsz=16735.6, num_updates=15400, lr=0.000509647, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14214 epoch 010: 225 / 1689 loss=3.785, nll_loss=2.258, ppl=4.78, wps=551728, ups=1.11, wpb=495444, bsz=16735.6, num_updates=15400, lr=0.000509647, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14214 epoch 010: 225 / 1689 loss=3.785, nll_loss=2.258, ppl=4.78, wps=551728, ups=1.11, wpb=495444, bsz=16735.6, num_updates=15400, lr=0.000509647, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14214 epoch 010: 225 / 1689 loss=3.785, nll_loss=2.258, ppl=4.78, wps=551728, ups=1.11, wpb=495444, bsz=16735.6, num_updates=15400, lr=0.000509647, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14214 epoch 010: 225 / 1689 loss=3.785, nll_loss=2.258, ppl=4.78, wps=551728, ups=1.11, wpb=495444, bsz=16735.6, num_updates=15400, lr=0.000509647, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14214 epoch 010: 225 / 1689 loss=3.785, nll_loss=2.258, ppl=4.78, wps=551728, ups=1.11, wpb=495444, bsz=16735.6, num_updates=15400, lr=0.000509647, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14214 epoch 010: 225 / 1689 loss=3.785, nll_loss=2.258, ppl=4.78, wps=551728, ups=1.11, wpb=495444, bsz=16735.6, num_updates=15400, lr=0.000509647, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14214 epoch 010: 225 / 1689 loss=3.785, nll_loss=2.258, ppl=4.78, wps=551728, ups=1.11, wpb=495444, bsz=16735.6, num_updates=15400, lr=0.000509647, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14214 epoch 010: 225 / 1689 loss=3.785, nll_loss=2.258, ppl=4.78, wps=551728, ups=1.11, wpb=495444, bsz=16735.6, num_updates=15400, lr=0.000509647, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14214 epoch 010: 225 / 1689 loss=3.785, nll_loss=2.258, ppl=4.78, wps=551728, ups=1.11, wpb=495444, bsz=16735.6, num_updates=15400, lr=0.000509647, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14214 epoch 010: 325 / 1689 loss=3.791, nll_loss=2.266, ppl=4.81, wps=554342, ups=1.12, wpb=495383, bsz=16541.2, num_updates=15500, lr=0.000508001, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=14303 epoch 010: 325 / 1689 loss=3.791, nll_loss=2.266, ppl=4.81, wps=554342, ups=1.12, wpb=495383, bsz=16541.2, num_updates=15500, lr=0.000508001, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=14303 epoch 010: 325 / 1689 loss=3.791, nll_loss=2.266, ppl=4.81, wps=554342, ups=1.12, wpb=495383, bsz=16541.2, num_updates=15500, lr=0.000508001, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=14303 epoch 010: 325 / 1689 loss=3.791, nll_loss=2.266, ppl=4.81, wps=554342, ups=1.12, wpb=495383, bsz=16541.2, num_updates=15500, lr=0.000508001, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=14303 epoch 010: 325 / 1689 loss=3.791, nll_loss=2.266, ppl=4.81, wps=554342, ups=1.12, wpb=495383, bsz=16541.2, num_updates=15500, lr=0.000508001, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=14303 epoch 010: 325 / 1689 loss=3.791, nll_loss=2.266, ppl=4.81, wps=554342, ups=1.12, wpb=495383, bsz=16541.2, num_updates=15500, lr=0.000508001, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=14303 epoch 010: 325 / 1689 loss=3.791, nll_loss=2.266, ppl=4.81, wps=554342, ups=1.12, wpb=495383, bsz=16541.2, num_updates=15500, lr=0.000508001, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=14303 epoch 010: 325 / 1689 loss=3.791, nll_loss=2.266, ppl=4.81, wps=554342, ups=1.12, wpb=495383, bsz=16541.2, num_updates=15500, lr=0.000508001, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=14303 epoch 010: 325 / 1689 loss=3.791, nll_loss=2.266, ppl=4.81, wps=554342, ups=1.12, wpb=495383, bsz=16541.2, num_updates=15500, lr=0.000508001, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=14303 epoch 010: 325 / 1689 loss=3.791, nll_loss=2.266, ppl=4.81, wps=554342, ups=1.12, wpb=495383, bsz=16541.2, num_updates=15500, lr=0.000508001, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=14303 epoch 010: 425 / 1689 loss=3.78, nll_loss=2.253, ppl=4.77, wps=554452, ups=1.12, wpb=495723, bsz=16586.1, num_updates=15600, lr=0.00050637, gnorm=0.234, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=14393 epoch 010: 425 / 1689 loss=3.78, nll_loss=2.253, ppl=4.77, wps=554452, ups=1.12, wpb=495723, bsz=16586.1, num_updates=15600, lr=0.00050637, gnorm=0.234, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=14393 epoch 010: 425 / 1689 loss=3.78, nll_loss=2.253, ppl=4.77, wps=554452, ups=1.12, wpb=495723, bsz=16586.1, num_updates=15600, lr=0.00050637, gnorm=0.234, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=14393 epoch 010: 425 / 1689 loss=3.78, nll_loss=2.253, ppl=4.77, wps=554452, ups=1.12, wpb=495723, bsz=16586.1, num_updates=15600, lr=0.00050637, gnorm=0.234, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=14393 epoch 010: 425 / 1689 loss=3.78, nll_loss=2.253, ppl=4.77, wps=554452, ups=1.12, wpb=495723, bsz=16586.1, num_updates=15600, lr=0.00050637, gnorm=0.234, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=14393 epoch 010: 425 / 1689 loss=3.78, nll_loss=2.253, ppl=4.77, wps=554452, ups=1.12, wpb=495723, bsz=16586.1, num_updates=15600, lr=0.00050637, gnorm=0.234, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=14393 epoch 010: 425 / 1689 loss=3.78, nll_loss=2.253, ppl=4.77, wps=554452, ups=1.12, wpb=495723, bsz=16586.1, num_updates=15600, lr=0.00050637, gnorm=0.234, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=14393 epoch 010: 425 / 1689 loss=3.78, nll_loss=2.253, ppl=4.77, wps=554452, ups=1.12, wpb=495723, bsz=16586.1, num_updates=15600, lr=0.00050637, gnorm=0.234, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=14393 epoch 010: 425 / 1689 loss=3.78, nll_loss=2.253, ppl=4.77, wps=554452, ups=1.12, wpb=495723, bsz=16586.1, num_updates=15600, lr=0.00050637, gnorm=0.234, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=14393 epoch 010: 425 / 1689 loss=3.78, nll_loss=2.253, ppl=4.77, wps=554452, ups=1.12, wpb=495723, bsz=16586.1, num_updates=15600, lr=0.00050637, gnorm=0.234, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=14393 epoch 010: 525 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556728, ups=1.12, wpb=496691, bsz=16283.9, num_updates=15700, lr=0.000504754, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14482 epoch 010: 525 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556728, ups=1.12, wpb=496691, bsz=16283.9, num_updates=15700, lr=0.000504754, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14482 epoch 010: 525 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556728, ups=1.12, wpb=496691, bsz=16283.9, num_updates=15700, lr=0.000504754, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14482 epoch 010: 525 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556728, ups=1.12, wpb=496691, bsz=16283.9, num_updates=15700, lr=0.000504754, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14482 epoch 010: 525 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556728, ups=1.12, wpb=496691, bsz=16283.9, num_updates=15700, lr=0.000504754, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14482 epoch 010: 525 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556728, ups=1.12, wpb=496691, bsz=16283.9, num_updates=15700, lr=0.000504754, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14482 epoch 010: 525 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556728, ups=1.12, wpb=496691, bsz=16283.9, num_updates=15700, lr=0.000504754, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14482 epoch 010: 525 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556728, ups=1.12, wpb=496691, bsz=16283.9, num_updates=15700, lr=0.000504754, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14482 epoch 010: 525 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556728, ups=1.12, wpb=496691, bsz=16283.9, num_updates=15700, lr=0.000504754, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14482 epoch 010: 525 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556728, ups=1.12, wpb=496691, bsz=16283.9, num_updates=15700, lr=0.000504754, gnorm=0.232, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14482 epoch 010: 625 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556578, ups=1.12, wpb=495498, bsz=16831.8, num_updates=15800, lr=0.000503155, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=14571 epoch 010: 625 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556578, ups=1.12, wpb=495498, bsz=16831.8, num_updates=15800, lr=0.000503155, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=14571 epoch 010: 625 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556578, ups=1.12, wpb=495498, bsz=16831.8, num_updates=15800, lr=0.000503155, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=14571 epoch 010: 625 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556578, ups=1.12, wpb=495498, bsz=16831.8, num_updates=15800, lr=0.000503155, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=14571 epoch 010: 625 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556578, ups=1.12, wpb=495498, bsz=16831.8, num_updates=15800, lr=0.000503155, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=14571 epoch 010: 625 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556578, ups=1.12, wpb=495498, bsz=16831.8, num_updates=15800, lr=0.000503155, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=14571 epoch 010: 625 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556578, ups=1.12, wpb=495498, bsz=16831.8, num_updates=15800, lr=0.000503155, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=14571 epoch 010: 625 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556578, ups=1.12, wpb=495498, bsz=16831.8, num_updates=15800, lr=0.000503155, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=14571 epoch 010: 625 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556578, ups=1.12, wpb=495498, bsz=16831.8, num_updates=15800, lr=0.000503155, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=14571 epoch 010: 625 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556578, ups=1.12, wpb=495498, bsz=16831.8, num_updates=15800, lr=0.000503155, gnorm=0.237, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=14571 epoch 010: 725 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553681, ups=1.12, wpb=495257, bsz=16521.9, num_updates=15900, lr=0.00050157, gnorm=0.233, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=14661 epoch 010: 725 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553681, ups=1.12, wpb=495257, bsz=16521.9, num_updates=15900, lr=0.00050157, gnorm=0.233, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=14661 epoch 010: 725 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553681, ups=1.12, wpb=495257, bsz=16521.9, num_updates=15900, lr=0.00050157, gnorm=0.233, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=14661 epoch 010: 725 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553681, ups=1.12, wpb=495257, bsz=16521.9, num_updates=15900, lr=0.00050157, gnorm=0.233, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=14661 epoch 010: 725 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553681, ups=1.12, wpb=495257, bsz=16521.9, num_updates=15900, lr=0.00050157, gnorm=0.233, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=14661 epoch 010: 725 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553681, ups=1.12, wpb=495257, bsz=16521.9, num_updates=15900, lr=0.00050157, gnorm=0.233, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=14661 epoch 010: 725 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553681, ups=1.12, wpb=495257, bsz=16521.9, num_updates=15900, lr=0.00050157, gnorm=0.233, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=14661 epoch 010: 725 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553681, ups=1.12, wpb=495257, bsz=16521.9, num_updates=15900, lr=0.00050157, gnorm=0.233, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=14661 epoch 010: 725 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553681, ups=1.12, wpb=495257, bsz=16521.9, num_updates=15900, lr=0.00050157, gnorm=0.233, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=14661 epoch 010: 725 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553681, ups=1.12, wpb=495257, bsz=16521.9, num_updates=15900, lr=0.00050157, gnorm=0.233, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=14661 epoch 010: 826 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=550989, ups=1.11, wpb=496020, bsz=16537.4, num_updates=16000, lr=0.0005, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14751 epoch 010: 826 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=550989, ups=1.11, wpb=496020, bsz=16537.4, num_updates=16000, lr=0.0005, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14751 epoch 010: 826 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=550989, ups=1.11, wpb=496020, bsz=16537.4, num_updates=16000, lr=0.0005, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14751 epoch 010: 826 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=550989, ups=1.11, wpb=496020, bsz=16537.4, num_updates=16000, lr=0.0005, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14751 epoch 010: 826 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=550989, ups=1.11, wpb=496020, bsz=16537.4, num_updates=16000, lr=0.0005, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14751 epoch 010: 826 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=550989, ups=1.11, wpb=496020, bsz=16537.4, num_updates=16000, lr=0.0005, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14751 epoch 010: 826 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=550989, ups=1.11, wpb=496020, bsz=16537.4, num_updates=16000, lr=0.0005, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14751 epoch 010: 826 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=550989, ups=1.11, wpb=496020, bsz=16537.4, num_updates=16000, lr=0.0005, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14751 epoch 010: 826 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=550989, ups=1.11, wpb=496020, bsz=16537.4, num_updates=16000, lr=0.0005, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14751 epoch 010: 826 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=550989, ups=1.11, wpb=496020, bsz=16537.4, num_updates=16000, lr=0.0005, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14751 begin validation on "valid" subset epoch 010 | valid on 'valid' subset | loss 3.795 | nll_loss 2.229 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.795 epoch 010 | valid on 'valid' subset | loss 3.795 | nll_loss 2.229 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.795 epoch 010 | valid on 'valid' subset | loss 3.795 | nll_loss 2.229 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.795 epoch 010 | valid on 'valid' subset | loss 3.795 | nll_loss 2.229 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.795 epoch 010 | valid on 'valid' subset | loss 3.795 | nll_loss 2.229 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.795 epoch 010 | valid on 'valid' subset | loss 3.795 | nll_loss 2.229 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.795 epoch 010 | valid on 'valid' subset | loss 3.795 | nll_loss 2.229 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.795 epoch 010 | valid on 'valid' subset | loss 3.795 | nll_loss 2.229 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.795 epoch 010 | valid on 'valid' subset | loss 3.795 | nll_loss 2.229 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.795 epoch 010 | valid on 'valid' subset | loss 3.795 | nll_loss 2.229 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.795 epoch 010: 926 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=463001, ups=0.93, wpb=496585, bsz=16772.8, num_updates=16100, lr=0.000498445, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14858 epoch 010: 926 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=463001, ups=0.93, wpb=496585, bsz=16772.8, num_updates=16100, lr=0.000498445, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14858 epoch 010: 926 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=463001, ups=0.93, wpb=496585, bsz=16772.8, num_updates=16100, lr=0.000498445, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14858 epoch 010: 926 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=463001, ups=0.93, wpb=496585, bsz=16772.8, num_updates=16100, lr=0.000498445, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14858 epoch 010: 926 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=463001, ups=0.93, wpb=496585, bsz=16772.8, num_updates=16100, lr=0.000498445, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14858 epoch 010: 926 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=463001, ups=0.93, wpb=496585, bsz=16772.8, num_updates=16100, lr=0.000498445, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14858 epoch 010: 926 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=463001, ups=0.93, wpb=496585, bsz=16772.8, num_updates=16100, lr=0.000498445, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14858 epoch 010: 926 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=463001, ups=0.93, wpb=496585, bsz=16772.8, num_updates=16100, lr=0.000498445, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14858 epoch 010: 926 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=463001, ups=0.93, wpb=496585, bsz=16772.8, num_updates=16100, lr=0.000498445, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14858 epoch 010: 926 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=463001, ups=0.93, wpb=496585, bsz=16772.8, num_updates=16100, lr=0.000498445, gnorm=0.226, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=14858 epoch 010: 1026 / 1689 loss=3.792, nll_loss=2.267, ppl=4.81, wps=563683, ups=1.14, wpb=495251, bsz=16563.5, num_updates=16200, lr=0.000496904, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=14946 epoch 010: 1026 / 1689 loss=3.792, nll_loss=2.267, ppl=4.81, wps=563683, ups=1.14, wpb=495251, bsz=16563.5, num_updates=16200, lr=0.000496904, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=14946 epoch 010: 1026 / 1689 loss=3.792, nll_loss=2.267, ppl=4.81, wps=563683, ups=1.14, wpb=495251, bsz=16563.5, num_updates=16200, lr=0.000496904, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=14946 epoch 010: 1026 / 1689 loss=3.792, nll_loss=2.267, ppl=4.81, wps=563683, ups=1.14, wpb=495251, bsz=16563.5, num_updates=16200, lr=0.000496904, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=14946 epoch 010: 1026 / 1689 loss=3.792, nll_loss=2.267, ppl=4.81, wps=563683, ups=1.14, wpb=495251, bsz=16563.5, num_updates=16200, lr=0.000496904, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=14946 epoch 010: 1026 / 1689 loss=3.792, nll_loss=2.267, ppl=4.81, wps=563683, ups=1.14, wpb=495251, bsz=16563.5, num_updates=16200, lr=0.000496904, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=14946 epoch 010: 1026 / 1689 loss=3.792, nll_loss=2.267, ppl=4.81, wps=563683, ups=1.14, wpb=495251, bsz=16563.5, num_updates=16200, lr=0.000496904, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=14946 epoch 010: 1026 / 1689 loss=3.792, nll_loss=2.267, ppl=4.81, wps=563683, ups=1.14, wpb=495251, bsz=16563.5, num_updates=16200, lr=0.000496904, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=14946 epoch 010: 1026 / 1689 loss=3.792, nll_loss=2.267, ppl=4.81, wps=563683, ups=1.14, wpb=495251, bsz=16563.5, num_updates=16200, lr=0.000496904, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=14946 epoch 010: 1026 / 1689 loss=3.792, nll_loss=2.267, ppl=4.81, wps=563683, ups=1.14, wpb=495251, bsz=16563.5, num_updates=16200, lr=0.000496904, gnorm=0.232, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=14946 epoch 010: 1126 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=560799, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=15034 epoch 010: 1126 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=560799, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=15034 epoch 010: 1126 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=560799, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=15034 epoch 010: 1126 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=560799, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=15034 epoch 010: 1126 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=560799, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=15034 epoch 010: 1126 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=560799, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=15034 epoch 010: 1126 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=560799, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=15034 epoch 010: 1126 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=560799, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=15034 epoch 010: 1126 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=560799, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=15034 epoch 010: 1126 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=560799, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=15034 epoch 010: 1226 / 1689 loss=3.788, nll_loss=2.263, ppl=4.8, wps=554838, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=15123 epoch 010: 1226 / 1689 loss=3.788, nll_loss=2.263, ppl=4.8, wps=554838, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=15123 epoch 010: 1226 / 1689 loss=3.788, nll_loss=2.263, ppl=4.8, wps=554838, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=15123 epoch 010: 1226 / 1689 loss=3.788, nll_loss=2.263, ppl=4.8, wps=554838, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=15123 epoch 010: 1226 / 1689 loss=3.788, nll_loss=2.263, ppl=4.8, wps=554838, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=15123 epoch 010: 1226 / 1689 loss=3.788, nll_loss=2.263, ppl=4.8, wps=554838, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=15123 epoch 010: 1226 / 1689 loss=3.788, nll_loss=2.263, ppl=4.8, wps=554838, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=15123 epoch 010: 1226 / 1689 loss=3.788, nll_loss=2.263, ppl=4.8, wps=554838, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=15123 epoch 010: 1226 / 1689 loss=3.788, nll_loss=2.263, ppl=4.8, wps=554838, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=15123 epoch 010: 1226 / 1689 loss=3.788, nll_loss=2.263, ppl=4.8, wps=554838, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.231, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=15123 epoch 010: 1326 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=555354, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.226, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=15212 epoch 010: 1326 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=555354, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.226, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=15212 epoch 010: 1326 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=555354, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.226, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=15212 epoch 010: 1326 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=555354, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.226, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=15212 epoch 010: 1326 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=555354, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.226, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=15212 epoch 010: 1326 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=555354, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.226, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=15212 epoch 010: 1326 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=555354, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.226, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=15212 epoch 010: 1326 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=555354, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.226, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=15212 epoch 010: 1326 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=555354, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.226, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=15212 epoch 010: 1326 / 1689 loss=3.79, nll_loss=2.265, ppl=4.81, wps=555354, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.226, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=15212 epoch 010: 1426 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=553143, ups=1.12, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.234, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=15302 epoch 010: 1426 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=553143, ups=1.12, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.234, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=15302 epoch 010: 1426 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=553143, ups=1.12, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.234, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=15302 epoch 010: 1426 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=553143, ups=1.12, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.234, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=15302 epoch 010: 1426 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=553143, ups=1.12, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.234, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=15302 epoch 010: 1426 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=553143, ups=1.12, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.234, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=15302 epoch 010: 1426 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=553143, ups=1.12, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.234, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=15302 epoch 010: 1426 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=553143, ups=1.12, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.234, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=15302 epoch 010: 1426 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=553143, ups=1.12, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.234, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=15302 epoch 010: 1426 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=553143, ups=1.12, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.234, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=15302 epoch 010: 1526 / 1689 loss=3.79, nll_loss=2.266, ppl=4.81, wps=553194, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.221, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=15391 epoch 010: 1526 / 1689 loss=3.79, nll_loss=2.266, ppl=4.81, wps=553194, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.221, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=15391 epoch 010: 1526 / 1689 loss=3.79, nll_loss=2.266, ppl=4.81, wps=553194, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.221, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=15391 epoch 010: 1526 / 1689 loss=3.79, nll_loss=2.266, ppl=4.81, wps=553194, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.221, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=15391 epoch 010: 1526 / 1689 loss=3.79, nll_loss=2.266, ppl=4.81, wps=553194, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.221, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=15391 epoch 010: 1526 / 1689 loss=3.79, nll_loss=2.266, ppl=4.81, wps=553194, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.221, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=15391 epoch 010: 1526 / 1689 loss=3.79, nll_loss=2.266, ppl=4.81, wps=553194, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.221, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=15391 epoch 010: 1526 / 1689 loss=3.79, nll_loss=2.266, ppl=4.81, wps=553194, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.221, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=15391 epoch 010: 1526 / 1689 loss=3.79, nll_loss=2.266, ppl=4.81, wps=553194, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.221, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=15391 epoch 010: 1526 / 1689 loss=3.79, nll_loss=2.266, ppl=4.81, wps=553194, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.221, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=15391 epoch 010: 1627 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=545603, ups=1.1, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.222, clip=0, loss_scale=2, train_wall=89, gb_free=19.9, wall=15482 epoch 010: 1627 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=545603, ups=1.1, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.222, clip=0, loss_scale=2, train_wall=89, gb_free=19.9, wall=15482 epoch 010: 1627 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=545603, ups=1.1, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.222, clip=0, loss_scale=2, train_wall=89, gb_free=19.9, wall=15482 epoch 010: 1627 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=545603, ups=1.1, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.222, clip=0, loss_scale=2, train_wall=89, gb_free=19.9, wall=15482 epoch 010: 1627 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=545603, ups=1.1, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.222, clip=0, loss_scale=2, train_wall=89, gb_free=19.9, wall=15482 epoch 010: 1627 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=545603, ups=1.1, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.222, clip=0, loss_scale=2, train_wall=89, gb_free=19.9, wall=15482 epoch 010: 1627 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=545603, ups=1.1, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.222, clip=0, loss_scale=2, train_wall=89, gb_free=19.9, wall=15482 epoch 010: 1627 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=545603, ups=1.1, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.222, clip=0, loss_scale=2, train_wall=89, gb_free=19.9, wall=15482 epoch 010: 1627 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=545603, ups=1.1, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.222, clip=0, loss_scale=2, train_wall=89, gb_free=19.9, wall=15482 epoch 010: 1627 / 1689 loss=3.779, nll_loss=2.253, ppl=4.77, wps=545603, ups=1.1, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.222, clip=0, loss_scale=2, train_wall=89, gb_free=19.9, wall=15482 end of epoch 10 (average epoch stats below) epoch 010 | loss 3.785 | nll_loss 2.259 | ppl 4.79 | wps 547276 | ups 1.11 | wpb 495116 | bsz 16503.7 | num_updates 16862 | lr 0.000487052 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1481 | gb_free 23.6 | wall 15536 epoch 010 | loss 3.785 | nll_loss 2.259 | ppl 4.79 | wps 547276 | ups 1.11 | wpb 495116 | bsz 16503.7 | num_updates 16862 | lr 0.000487052 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1481 | gb_free 23.6 | wall 15536 epoch 010 | loss 3.785 | nll_loss 2.259 | ppl 4.79 | wps 547276 | ups 1.11 | wpb 495116 | bsz 16503.7 | num_updates 16862 | lr 0.000487052 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1481 | gb_free 23.6 | wall 15536 epoch 010 | loss 3.785 | nll_loss 2.259 | ppl 4.79 | wps 547276 | ups 1.11 | wpb 495116 | bsz 16503.7 | num_updates 16862 | lr 0.000487052 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1481 | gb_free 23.6 | wall 15536 epoch 010 | loss 3.785 | nll_loss 2.259 | ppl 4.79 | wps 547276 | ups 1.11 | wpb 495116 | bsz 16503.7 | num_updates 16862 | lr 0.000487052 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1481 | gb_free 23.6 | wall 15536 epoch 010 | loss 3.785 | nll_loss 2.259 | ppl 4.79 | wps 547276 | ups 1.11 | wpb 495116 | bsz 16503.7 | num_updates 16862 | lr 0.000487052 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1481 | gb_free 23.6 | wall 15536 epoch 010 | loss 3.785 | nll_loss 2.259 | ppl 4.79 | wps 547276 | ups 1.11 | wpb 495116 | bsz 16503.7 | num_updates 16862 | lr 0.000487052 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1481 | gb_free 23.6 | wall 15536 epoch 010 | loss 3.785 | nll_loss 2.259 | ppl 4.79 | wps 547276 | ups 1.11 | wpb 495116 | bsz 16503.7 | num_updates 16862 | lr 0.000487052 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1481 | gb_free 23.6 | wall 15536 epoch 010 | loss 3.785 | nll_loss 2.259 | ppl 4.79 | wps 547276 | ups 1.11 | wpb 495116 | bsz 16503.7 | num_updates 16862 | lr 0.000487052 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1481 | gb_free 23.6 | wall 15536 epoch 010 | loss 3.785 | nll_loss 2.259 | ppl 4.79 | wps 547276 | ups 1.11 | wpb 495116 | bsz 16503.7 | num_updates 16862 | lr 0.000487052 | gnorm 0.229 | clip 0 | loss_scale 2 | train_wall 1481 | gb_free 23.6 | wall 15536 Start iterating over samples epoch 011: 38 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=543152, ups=1.11, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.225, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=15572 epoch 011: 38 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=543152, ups=1.11, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.225, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=15572 epoch 011: 38 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=543152, ups=1.11, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.225, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=15572 epoch 011: 38 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=543152, ups=1.11, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.225, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=15572 epoch 011: 38 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=543152, ups=1.11, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.225, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=15572 epoch 011: 38 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=543152, ups=1.11, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.225, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=15572 epoch 011: 38 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=543152, ups=1.11, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.225, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=15572 epoch 011: 38 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=543152, ups=1.11, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.225, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=15572 epoch 011: 38 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=543152, ups=1.11, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.225, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=15572 epoch 011: 38 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=543152, ups=1.11, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.225, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=15572 epoch 011: 38 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=543152, ups=1.11, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.225, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=15572 epoch 011: 138 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=556531, ups=1.12, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=15662 epoch 011: 138 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=556531, ups=1.12, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=15662 epoch 011: 138 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=556531, ups=1.12, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=15662 epoch 011: 138 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=556531, ups=1.12, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=15662 epoch 011: 138 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=556531, ups=1.12, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=15662 epoch 011: 138 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=556531, ups=1.12, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=15662 epoch 011: 138 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=556531, ups=1.12, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=15662 epoch 011: 138 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=556531, ups=1.12, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=15662 epoch 011: 138 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=556531, ups=1.12, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=15662 epoch 011: 138 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=556531, ups=1.12, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=15662 epoch 011: 138 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=556531, ups=1.12, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=21.8, wall=15662 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 3.796 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.795 epoch 011 | valid on 'valid' subset | loss 3.796 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.795 epoch 011 | valid on 'valid' subset | loss 3.796 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.795 epoch 011 | valid on 'valid' subset | loss 3.796 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.795 epoch 011 | valid on 'valid' subset | loss 3.796 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.795 epoch 011 | valid on 'valid' subset | loss 3.796 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.795 epoch 011 | valid on 'valid' subset | loss 3.796 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.795 epoch 011 | valid on 'valid' subset | loss 3.796 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.795 epoch 011 | valid on 'valid' subset | loss 3.796 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.795 epoch 011 | valid on 'valid' subset | loss 3.796 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.795 epoch 011 | valid on 'valid' subset | loss 3.796 | nll_loss 2.23 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.795 epoch 011: 238 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=484096, ups=0.98, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.227, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15764 epoch 011: 238 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=484096, ups=0.98, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.227, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15764 epoch 011: 238 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=484096, ups=0.98, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.227, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15764 epoch 011: 238 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=484096, ups=0.98, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.227, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15764 epoch 011: 238 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=484096, ups=0.98, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.227, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15764 epoch 011: 238 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=484096, ups=0.98, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.227, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15764 epoch 011: 238 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=484096, ups=0.98, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.227, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15764 epoch 011: 238 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=484096, ups=0.98, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.227, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15764 epoch 011: 238 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=484096, ups=0.98, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.227, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15764 epoch 011: 238 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=484096, ups=0.98, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.227, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15764 epoch 011: 238 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=484096, ups=0.98, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.227, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15764 epoch 011: 338 / 1689 loss=3.767, nll_loss=2.24, ppl=4.72, wps=552298, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.23, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=15854 epoch 011: 338 / 1689 loss=3.767, nll_loss=2.24, ppl=4.72, wps=552298, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.23, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=15854 epoch 011: 338 / 1689 loss=3.767, nll_loss=2.24, ppl=4.72, wps=552298, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.23, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=15854 epoch 011: 338 / 1689 loss=3.767, nll_loss=2.24, ppl=4.72, wps=552298, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.23, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=15854 epoch 011: 338 / 1689 loss=3.767, nll_loss=2.24, ppl=4.72, wps=552298, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.23, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=15854 epoch 011: 338 / 1689 loss=3.767, nll_loss=2.24, ppl=4.72, wps=552298, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.23, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=15854 epoch 011: 338 / 1689 loss=3.767, nll_loss=2.24, ppl=4.72, wps=552298, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.23, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=15854 epoch 011: 338 / 1689 loss=3.767, nll_loss=2.24, ppl=4.72, wps=552298, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.23, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=15854 epoch 011: 338 / 1689 loss=3.767, nll_loss=2.24, ppl=4.72, wps=552298, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.23, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=15854 epoch 011: 338 / 1689 loss=3.767, nll_loss=2.24, ppl=4.72, wps=552298, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.23, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=15854 epoch 011: 338 / 1689 loss=3.767, nll_loss=2.24, ppl=4.72, wps=552298, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.23, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=15854 epoch 011: 438 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556202, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.23, clip=0, loss_scale=4, train_wall=87, gb_free=22.6, wall=15943 epoch 011: 438 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556202, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.23, clip=0, loss_scale=4, train_wall=87, gb_free=22.6, wall=15943 epoch 011: 438 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556202, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.23, clip=0, loss_scale=4, train_wall=87, gb_free=22.6, wall=15943 epoch 011: 438 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556202, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.23, clip=0, loss_scale=4, train_wall=87, gb_free=22.6, wall=15943 epoch 011: 438 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556202, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.23, clip=0, loss_scale=4, train_wall=87, gb_free=22.6, wall=15943 epoch 011: 438 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556202, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.23, clip=0, loss_scale=4, train_wall=87, gb_free=22.6, wall=15943 epoch 011: 438 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556202, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.23, clip=0, loss_scale=4, train_wall=87, gb_free=22.6, wall=15943 epoch 011: 438 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556202, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.23, clip=0, loss_scale=4, train_wall=87, gb_free=22.6, wall=15943 epoch 011: 438 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556202, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.23, clip=0, loss_scale=4, train_wall=87, gb_free=22.6, wall=15943 epoch 011: 438 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556202, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.23, clip=0, loss_scale=4, train_wall=87, gb_free=22.6, wall=15943 epoch 011: 438 / 1689 loss=3.776, nll_loss=2.249, ppl=4.75, wps=556202, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.23, clip=0, loss_scale=4, train_wall=87, gb_free=22.6, wall=15943 epoch 011: 539 / 1689 loss=3.776, nll_loss=2.25, ppl=4.76, wps=551393, ups=1.12, wpb=493736, bsz=16601.1, num_updates=17400, lr=0.000479463, gnorm=0.217, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=16032 epoch 011: 539 / 1689 loss=3.776, nll_loss=2.25, ppl=4.76, wps=551393, ups=1.12, wpb=493736, bsz=16601.1, num_updates=17400, lr=0.000479463, gnorm=0.217, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=16032 epoch 011: 539 / 1689 loss=3.776, nll_loss=2.25, ppl=4.76, wps=551393, ups=1.12, wpb=493736, bsz=16601.1, num_updates=17400, lr=0.000479463, gnorm=0.217, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=16032 epoch 011: 539 / 1689 loss=3.776, nll_loss=2.25, ppl=4.76, wps=551393, ups=1.12, wpb=493736, bsz=16601.1, num_updates=17400, lr=0.000479463, gnorm=0.217, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=16032 epoch 011: 539 / 1689 loss=3.776, nll_loss=2.25, ppl=4.76, wps=551393, ups=1.12, wpb=493736, bsz=16601.1, num_updates=17400, lr=0.000479463, gnorm=0.217, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=16032 epoch 011: 539 / 1689 loss=3.776, nll_loss=2.25, ppl=4.76, wps=551393, ups=1.12, wpb=493736, bsz=16601.1, num_updates=17400, lr=0.000479463, gnorm=0.217, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=16032 epoch 011: 539 / 1689 loss=3.776, nll_loss=2.25, ppl=4.76, wps=551393, ups=1.12, wpb=493736, bsz=16601.1, num_updates=17400, lr=0.000479463, gnorm=0.217, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=16032 epoch 011: 539 / 1689 loss=3.776, nll_loss=2.25, ppl=4.76, wps=551393, ups=1.12, wpb=493736, bsz=16601.1, num_updates=17400, lr=0.000479463, gnorm=0.217, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=16032 epoch 011: 539 / 1689 loss=3.776, nll_loss=2.25, ppl=4.76, wps=551393, ups=1.12, wpb=493736, bsz=16601.1, num_updates=17400, lr=0.000479463, gnorm=0.217, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=16032 epoch 011: 539 / 1689 loss=3.776, nll_loss=2.25, ppl=4.76, wps=551393, ups=1.12, wpb=493736, bsz=16601.1, num_updates=17400, lr=0.000479463, gnorm=0.217, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=16032 epoch 011: 539 / 1689 loss=3.776, nll_loss=2.25, ppl=4.76, wps=551393, ups=1.12, wpb=493736, bsz=16601.1, num_updates=17400, lr=0.000479463, gnorm=0.217, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=16032 epoch 011: 639 / 1689 loss=3.768, nll_loss=2.241, ppl=4.73, wps=565812, ups=1.14, wpb=496576, bsz=16261.8, num_updates=17500, lr=0.000478091, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=20.6, wall=16120 epoch 011: 639 / 1689 loss=3.768, nll_loss=2.241, ppl=4.73, wps=565812, ups=1.14, wpb=496576, bsz=16261.8, num_updates=17500, lr=0.000478091, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=20.6, wall=16120 epoch 011: 639 / 1689 loss=3.768, nll_loss=2.241, ppl=4.73, wps=565812, ups=1.14, wpb=496576, bsz=16261.8, num_updates=17500, lr=0.000478091, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=20.6, wall=16120 epoch 011: 639 / 1689 loss=3.768, nll_loss=2.241, ppl=4.73, wps=565812, ups=1.14, wpb=496576, bsz=16261.8, num_updates=17500, lr=0.000478091, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=20.6, wall=16120 epoch 011: 639 / 1689 loss=3.768, nll_loss=2.241, ppl=4.73, wps=565812, ups=1.14, wpb=496576, bsz=16261.8, num_updates=17500, lr=0.000478091, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=20.6, wall=16120 epoch 011: 639 / 1689 loss=3.768, nll_loss=2.241, ppl=4.73, wps=565812, ups=1.14, wpb=496576, bsz=16261.8, num_updates=17500, lr=0.000478091, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=20.6, wall=16120 epoch 011: 639 / 1689 loss=3.768, nll_loss=2.241, ppl=4.73, wps=565812, ups=1.14, wpb=496576, bsz=16261.8, num_updates=17500, lr=0.000478091, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=20.6, wall=16120 epoch 011: 639 / 1689 loss=3.768, nll_loss=2.241, ppl=4.73, wps=565812, ups=1.14, wpb=496576, bsz=16261.8, num_updates=17500, lr=0.000478091, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=20.6, wall=16120 epoch 011: 639 / 1689 loss=3.768, nll_loss=2.241, ppl=4.73, wps=565812, ups=1.14, wpb=496576, bsz=16261.8, num_updates=17500, lr=0.000478091, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=20.6, wall=16120 epoch 011: 639 / 1689 loss=3.768, nll_loss=2.241, ppl=4.73, wps=565812, ups=1.14, wpb=496576, bsz=16261.8, num_updates=17500, lr=0.000478091, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=20.6, wall=16120 epoch 011: 639 / 1689 loss=3.768, nll_loss=2.241, ppl=4.73, wps=565812, ups=1.14, wpb=496576, bsz=16261.8, num_updates=17500, lr=0.000478091, gnorm=0.224, clip=0, loss_scale=2, train_wall=87, gb_free=20.6, wall=16120 epoch 011: 739 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=558350, ups=1.13, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=16208 epoch 011: 739 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=558350, ups=1.13, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=16208 epoch 011: 739 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=558350, ups=1.13, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=16208 epoch 011: 739 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=558350, ups=1.13, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=16208 epoch 011: 739 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=558350, ups=1.13, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=16208 epoch 011: 739 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=558350, ups=1.13, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=16208 epoch 011: 739 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=558350, ups=1.13, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=16208 epoch 011: 739 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=558350, ups=1.13, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=16208 epoch 011: 739 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=558350, ups=1.13, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=16208 epoch 011: 739 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=558350, ups=1.13, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=16208 epoch 011: 739 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=558350, ups=1.13, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.229, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=16208 epoch 011: 839 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=557562, ups=1.12, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.223, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=16297 epoch 011: 839 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=557562, ups=1.12, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.223, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=16297 epoch 011: 839 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=557562, ups=1.12, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.223, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=16297 epoch 011: 839 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=557562, ups=1.12, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.223, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=16297 epoch 011: 839 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=557562, ups=1.12, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.223, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=16297 epoch 011: 839 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=557562, ups=1.12, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.223, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=16297 epoch 011: 839 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=557562, ups=1.12, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.223, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=16297 epoch 011: 839 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=557562, ups=1.12, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.223, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=16297 epoch 011: 839 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=557562, ups=1.12, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.223, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=16297 epoch 011: 839 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=557562, ups=1.12, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.223, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=16297 epoch 011: 839 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=557562, ups=1.12, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.223, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=16297 epoch 011: 939 / 1689 loss=3.763, nll_loss=2.235, ppl=4.71, wps=557866, ups=1.12, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16386 epoch 011: 939 / 1689 loss=3.763, nll_loss=2.235, ppl=4.71, wps=557866, ups=1.12, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16386 epoch 011: 939 / 1689 loss=3.763, nll_loss=2.235, ppl=4.71, wps=557866, ups=1.12, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16386 epoch 011: 939 / 1689 loss=3.763, nll_loss=2.235, ppl=4.71, wps=557866, ups=1.12, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16386 epoch 011: 939 / 1689 loss=3.763, nll_loss=2.235, ppl=4.71, wps=557866, ups=1.12, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16386 epoch 011: 939 / 1689 loss=3.763, nll_loss=2.235, ppl=4.71, wps=557866, ups=1.12, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16386 epoch 011: 939 / 1689 loss=3.763, nll_loss=2.235, ppl=4.71, wps=557866, ups=1.12, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16386 epoch 011: 939 / 1689 loss=3.763, nll_loss=2.235, ppl=4.71, wps=557866, ups=1.12, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16386 epoch 011: 939 / 1689 loss=3.763, nll_loss=2.235, ppl=4.71, wps=557866, ups=1.12, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16386 epoch 011: 939 / 1689 loss=3.763, nll_loss=2.235, ppl=4.71, wps=557866, ups=1.12, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16386 epoch 011: 939 / 1689 loss=3.763, nll_loss=2.235, ppl=4.71, wps=557866, ups=1.12, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16386 epoch 011: 1039 / 1689 loss=3.766, nll_loss=2.239, ppl=4.72, wps=551816, ups=1.12, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=16476 epoch 011: 1039 / 1689 loss=3.766, nll_loss=2.239, ppl=4.72, wps=551816, ups=1.12, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=16476 epoch 011: 1039 / 1689 loss=3.766, nll_loss=2.239, ppl=4.72, wps=551816, ups=1.12, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=16476 epoch 011: 1039 / 1689 loss=3.766, nll_loss=2.239, ppl=4.72, wps=551816, ups=1.12, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=16476 epoch 011: 1039 / 1689 loss=3.766, nll_loss=2.239, ppl=4.72, wps=551816, ups=1.12, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=16476 epoch 011: 1039 / 1689 loss=3.766, nll_loss=2.239, ppl=4.72, wps=551816, ups=1.12, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=16476 epoch 011: 1039 / 1689 loss=3.766, nll_loss=2.239, ppl=4.72, wps=551816, ups=1.12, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=16476 epoch 011: 1039 / 1689 loss=3.766, nll_loss=2.239, ppl=4.72, wps=551816, ups=1.12, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=16476 epoch 011: 1039 / 1689 loss=3.766, nll_loss=2.239, ppl=4.72, wps=551816, ups=1.12, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=16476 epoch 011: 1039 / 1689 loss=3.766, nll_loss=2.239, ppl=4.72, wps=551816, ups=1.12, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=16476 epoch 011: 1039 / 1689 loss=3.766, nll_loss=2.239, ppl=4.72, wps=551816, ups=1.12, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=16476 epoch 011: 1140 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=550435, ups=1.11, wpb=495854, bsz=16595.9, num_updates=18000, lr=0.000471405, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=16566 epoch 011: 1140 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=550435, ups=1.11, wpb=495854, bsz=16595.9, num_updates=18000, lr=0.000471405, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=16566 epoch 011: 1140 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=550435, ups=1.11, wpb=495854, bsz=16595.9, num_updates=18000, lr=0.000471405, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=16566 epoch 011: 1140 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=550435, ups=1.11, wpb=495854, bsz=16595.9, num_updates=18000, lr=0.000471405, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=16566 epoch 011: 1140 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=550435, ups=1.11, wpb=495854, bsz=16595.9, num_updates=18000, lr=0.000471405, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=16566 epoch 011: 1140 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=550435, ups=1.11, wpb=495854, bsz=16595.9, num_updates=18000, lr=0.000471405, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=16566 epoch 011: 1140 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=550435, ups=1.11, wpb=495854, bsz=16595.9, num_updates=18000, lr=0.000471405, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=16566 epoch 011: 1140 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=550435, ups=1.11, wpb=495854, bsz=16595.9, num_updates=18000, lr=0.000471405, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=16566 epoch 011: 1140 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=550435, ups=1.11, wpb=495854, bsz=16595.9, num_updates=18000, lr=0.000471405, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=16566 epoch 011: 1140 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=550435, ups=1.11, wpb=495854, bsz=16595.9, num_updates=18000, lr=0.000471405, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=16566 epoch 011: 1140 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=550435, ups=1.11, wpb=495854, bsz=16595.9, num_updates=18000, lr=0.000471405, gnorm=0.237, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=16566 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 3.784 | nll_loss 2.212 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.784 epoch 011 | valid on 'valid' subset | loss 3.784 | nll_loss 2.212 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.784 epoch 011 | valid on 'valid' subset | loss 3.784 | nll_loss 2.212 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.784 epoch 011 | valid on 'valid' subset | loss 3.784 | nll_loss 2.212 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.784 epoch 011 | valid on 'valid' subset | loss 3.784 | nll_loss 2.212 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.784 epoch 011 | valid on 'valid' subset | loss 3.784 | nll_loss 2.212 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.784 epoch 011 | valid on 'valid' subset | loss 3.784 | nll_loss 2.212 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.784 epoch 011 | valid on 'valid' subset | loss 3.784 | nll_loss 2.212 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.784 epoch 011 | valid on 'valid' subset | loss 3.784 | nll_loss 2.212 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.784 epoch 011 | valid on 'valid' subset | loss 3.784 | nll_loss 2.212 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.784 epoch 011 | valid on 'valid' subset | loss 3.784 | nll_loss 2.212 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.784 epoch 011: 1240 / 1689 loss=3.766, nll_loss=2.238, ppl=4.72, wps=378460, ups=0.76, wpb=495557, bsz=16747.1, num_updates=18100, lr=0.0004701, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=16697 epoch 011: 1240 / 1689 loss=3.766, nll_loss=2.238, ppl=4.72, wps=378460, ups=0.76, wpb=495557, bsz=16747.1, num_updates=18100, lr=0.0004701, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=16697 epoch 011: 1240 / 1689 loss=3.766, nll_loss=2.238, ppl=4.72, wps=378460, ups=0.76, wpb=495557, bsz=16747.1, num_updates=18100, lr=0.0004701, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=16697 epoch 011: 1240 / 1689 loss=3.766, nll_loss=2.238, ppl=4.72, wps=378460, ups=0.76, wpb=495557, bsz=16747.1, num_updates=18100, lr=0.0004701, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=16697 epoch 011: 1240 / 1689 loss=3.766, nll_loss=2.238, ppl=4.72, wps=378460, ups=0.76, wpb=495557, bsz=16747.1, num_updates=18100, lr=0.0004701, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=16697 epoch 011: 1240 / 1689 loss=3.766, nll_loss=2.238, ppl=4.72, wps=378460, ups=0.76, wpb=495557, bsz=16747.1, num_updates=18100, lr=0.0004701, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=16697 epoch 011: 1240 / 1689 loss=3.766, nll_loss=2.238, ppl=4.72, wps=378460, ups=0.76, wpb=495557, bsz=16747.1, num_updates=18100, lr=0.0004701, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=16697 epoch 011: 1240 / 1689 loss=3.766, nll_loss=2.238, ppl=4.72, wps=378460, ups=0.76, wpb=495557, bsz=16747.1, num_updates=18100, lr=0.0004701, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=16697 epoch 011: 1240 / 1689 loss=3.766, nll_loss=2.238, ppl=4.72, wps=378460, ups=0.76, wpb=495557, bsz=16747.1, num_updates=18100, lr=0.0004701, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=16697 epoch 011: 1240 / 1689 loss=3.766, nll_loss=2.238, ppl=4.72, wps=378460, ups=0.76, wpb=495557, bsz=16747.1, num_updates=18100, lr=0.0004701, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=16697 epoch 011: 1240 / 1689 loss=3.766, nll_loss=2.238, ppl=4.72, wps=378460, ups=0.76, wpb=495557, bsz=16747.1, num_updates=18100, lr=0.0004701, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=16697 epoch 011: 1340 / 1689 loss=3.77, nll_loss=2.244, ppl=4.74, wps=563466, ups=1.14, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.22, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=16785 epoch 011: 1340 / 1689 loss=3.77, nll_loss=2.244, ppl=4.74, wps=563466, ups=1.14, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.22, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=16785 epoch 011: 1340 / 1689 loss=3.77, nll_loss=2.244, ppl=4.74, wps=563466, ups=1.14, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.22, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=16785 epoch 011: 1340 / 1689 loss=3.77, nll_loss=2.244, ppl=4.74, wps=563466, ups=1.14, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.22, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=16785 epoch 011: 1340 / 1689 loss=3.77, nll_loss=2.244, ppl=4.74, wps=563466, ups=1.14, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.22, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=16785 epoch 011: 1340 / 1689 loss=3.77, nll_loss=2.244, ppl=4.74, wps=563466, ups=1.14, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.22, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=16785 epoch 011: 1340 / 1689 loss=3.77, nll_loss=2.244, ppl=4.74, wps=563466, ups=1.14, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.22, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=16785 epoch 011: 1340 / 1689 loss=3.77, nll_loss=2.244, ppl=4.74, wps=563466, ups=1.14, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.22, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=16785 epoch 011: 1340 / 1689 loss=3.77, nll_loss=2.244, ppl=4.74, wps=563466, ups=1.14, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.22, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=16785 epoch 011: 1340 / 1689 loss=3.77, nll_loss=2.244, ppl=4.74, wps=563466, ups=1.14, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.22, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=16785 epoch 011: 1340 / 1689 loss=3.77, nll_loss=2.244, ppl=4.74, wps=563466, ups=1.14, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.22, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=16785 epoch 011: 1440 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=560988, ups=1.14, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=16873 epoch 011: 1440 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=560988, ups=1.14, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=16873 epoch 011: 1440 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=560988, ups=1.14, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=16873 epoch 011: 1440 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=560988, ups=1.14, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=16873 epoch 011: 1440 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=560988, ups=1.14, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=16873 epoch 011: 1440 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=560988, ups=1.14, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=16873 epoch 011: 1440 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=560988, ups=1.14, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=16873 epoch 011: 1440 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=560988, ups=1.14, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=16873 epoch 011: 1440 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=560988, ups=1.14, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=16873 epoch 011: 1440 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=560988, ups=1.14, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=16873 epoch 011: 1440 / 1689 loss=3.77, nll_loss=2.243, ppl=4.73, wps=560988, ups=1.14, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=2, train_wall=87, gb_free=21.3, wall=16873 epoch 011: 1540 / 1689 loss=3.771, nll_loss=2.245, ppl=4.74, wps=561247, ups=1.13, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.228, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=16961 epoch 011: 1540 / 1689 loss=3.771, nll_loss=2.245, ppl=4.74, wps=561247, ups=1.13, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.228, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=16961 epoch 011: 1540 / 1689 loss=3.771, nll_loss=2.245, ppl=4.74, wps=561247, ups=1.13, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.228, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=16961 epoch 011: 1540 / 1689 loss=3.771, nll_loss=2.245, ppl=4.74, wps=561247, ups=1.13, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.228, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=16961 epoch 011: 1540 / 1689 loss=3.771, nll_loss=2.245, ppl=4.74, wps=561247, ups=1.13, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.228, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=16961 epoch 011: 1540 / 1689 loss=3.771, nll_loss=2.245, ppl=4.74, wps=561247, ups=1.13, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.228, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=16961 epoch 011: 1540 / 1689 loss=3.771, nll_loss=2.245, ppl=4.74, wps=561247, ups=1.13, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.228, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=16961 epoch 011: 1540 / 1689 loss=3.771, nll_loss=2.245, ppl=4.74, wps=561247, ups=1.13, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.228, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=16961 epoch 011: 1540 / 1689 loss=3.771, nll_loss=2.245, ppl=4.74, wps=561247, ups=1.13, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.228, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=16961 epoch 011: 1540 / 1689 loss=3.771, nll_loss=2.245, ppl=4.74, wps=561247, ups=1.13, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.228, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=16961 epoch 011: 1540 / 1689 loss=3.771, nll_loss=2.245, ppl=4.74, wps=561247, ups=1.13, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.228, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=16961 epoch 011: 1640 / 1689 loss=3.765, nll_loss=2.237, ppl=4.72, wps=561356, ups=1.13, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17050 epoch 011: 1640 / 1689 loss=3.765, nll_loss=2.237, ppl=4.72, wps=561356, ups=1.13, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17050 epoch 011: 1640 / 1689 loss=3.765, nll_loss=2.237, ppl=4.72, wps=561356, ups=1.13, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17050 epoch 011: 1640 / 1689 loss=3.765, nll_loss=2.237, ppl=4.72, wps=561356, ups=1.13, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17050 epoch 011: 1640 / 1689 loss=3.765, nll_loss=2.237, ppl=4.72, wps=561356, ups=1.13, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17050 epoch 011: 1640 / 1689 loss=3.765, nll_loss=2.237, ppl=4.72, wps=561356, ups=1.13, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17050 epoch 011: 1640 / 1689 loss=3.765, nll_loss=2.237, ppl=4.72, wps=561356, ups=1.13, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17050 epoch 011: 1640 / 1689 loss=3.765, nll_loss=2.237, ppl=4.72, wps=561356, ups=1.13, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17050 epoch 011: 1640 / 1689 loss=3.765, nll_loss=2.237, ppl=4.72, wps=561356, ups=1.13, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17050 epoch 011: 1640 / 1689 loss=3.765, nll_loss=2.237, ppl=4.72, wps=561356, ups=1.13, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17050 epoch 011: 1640 / 1689 loss=3.765, nll_loss=2.237, ppl=4.72, wps=561356, ups=1.13, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17050 end of epoch 11 (average epoch stats below) epoch 011 | loss 3.768 | nll_loss 2.24 | ppl 4.72 | wps 536551 | ups 1.08 | wpb 495129 | bsz 16502.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.224 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 22.9 | wall 17092 epoch 011 | loss 3.768 | nll_loss 2.24 | ppl 4.72 | wps 536551 | ups 1.08 | wpb 495129 | bsz 16502.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.224 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 22.9 | wall 17092 epoch 011 | loss 3.768 | nll_loss 2.24 | ppl 4.72 | wps 536551 | ups 1.08 | wpb 495129 | bsz 16502.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.224 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 22.9 | wall 17092 epoch 011 | loss 3.768 | nll_loss 2.24 | ppl 4.72 | wps 536551 | ups 1.08 | wpb 495129 | bsz 16502.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.224 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 22.9 | wall 17092 epoch 011 | loss 3.768 | nll_loss 2.24 | ppl 4.72 | wps 536551 | ups 1.08 | wpb 495129 | bsz 16502.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.224 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 22.9 | wall 17092 epoch 011 | loss 3.768 | nll_loss 2.24 | ppl 4.72 | wps 536551 | ups 1.08 | wpb 495129 | bsz 16502.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.224 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 22.9 | wall 17092 epoch 011 | loss 3.768 | nll_loss 2.24 | ppl 4.72 | wps 536551 | ups 1.08 | wpb 495129 | bsz 16502.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.224 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 22.9 | wall 17092 epoch 011 | loss 3.768 | nll_loss 2.24 | ppl 4.72 | wps 536551 | ups 1.08 | wpb 495129 | bsz 16502.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.224 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 22.9 | wall 17092 epoch 011 | loss 3.768 | nll_loss 2.24 | ppl 4.72 | wps 536551 | ups 1.08 | wpb 495129 | bsz 16502.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.224 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 22.9 | wall 17092 epoch 011 | loss 3.768 | nll_loss 2.24 | ppl 4.72 | wps 536551 | ups 1.08 | wpb 495129 | bsz 16502.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.224 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 22.9 | wall 17092 epoch 011 | loss 3.768 | nll_loss 2.24 | ppl 4.72 | wps 536551 | ups 1.08 | wpb 495129 | bsz 16502.1 | num_updates 18548 | lr 0.000464388 | gnorm 0.224 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 22.9 | wall 17092 Start iterating over samples epoch 012: 52 / 1689 loss=3.757, nll_loss=2.228, ppl=4.68, wps=549878, ups=1.12, wpb=491232, bsz=16369.8, num_updates=18600, lr=0.000463739, gnorm=0.215, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17139 epoch 012: 52 / 1689 loss=3.757, nll_loss=2.228, ppl=4.68, wps=549878, ups=1.12, wpb=491232, bsz=16369.8, num_updates=18600, lr=0.000463739, gnorm=0.215, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17139 epoch 012: 52 / 1689 loss=3.757, nll_loss=2.228, ppl=4.68, wps=549878, ups=1.12, wpb=491232, bsz=16369.8, num_updates=18600, lr=0.000463739, gnorm=0.215, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17139 epoch 012: 52 / 1689 loss=3.757, nll_loss=2.228, ppl=4.68, wps=549878, ups=1.12, wpb=491232, bsz=16369.8, num_updates=18600, lr=0.000463739, gnorm=0.215, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17139 epoch 012: 52 / 1689 loss=3.757, nll_loss=2.228, ppl=4.68, wps=549878, ups=1.12, wpb=491232, bsz=16369.8, num_updates=18600, lr=0.000463739, gnorm=0.215, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17139 epoch 012: 52 / 1689 loss=3.757, nll_loss=2.228, ppl=4.68, wps=549878, ups=1.12, wpb=491232, bsz=16369.8, num_updates=18600, lr=0.000463739, gnorm=0.215, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17139 epoch 012: 52 / 1689 loss=3.757, nll_loss=2.228, ppl=4.68, wps=549878, ups=1.12, wpb=491232, bsz=16369.8, num_updates=18600, lr=0.000463739, gnorm=0.215, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17139 epoch 012: 52 / 1689 loss=3.757, nll_loss=2.228, ppl=4.68, wps=549878, ups=1.12, wpb=491232, bsz=16369.8, num_updates=18600, lr=0.000463739, gnorm=0.215, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17139 epoch 012: 52 / 1689 loss=3.757, nll_loss=2.228, ppl=4.68, wps=549878, ups=1.12, wpb=491232, bsz=16369.8, num_updates=18600, lr=0.000463739, gnorm=0.215, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17139 epoch 012: 52 / 1689 loss=3.757, nll_loss=2.228, ppl=4.68, wps=549878, ups=1.12, wpb=491232, bsz=16369.8, num_updates=18600, lr=0.000463739, gnorm=0.215, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17139 epoch 012: 52 / 1689 loss=3.757, nll_loss=2.228, ppl=4.68, wps=549878, ups=1.12, wpb=491232, bsz=16369.8, num_updates=18600, lr=0.000463739, gnorm=0.215, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17139 epoch 012: 52 / 1689 loss=3.757, nll_loss=2.228, ppl=4.68, wps=549878, ups=1.12, wpb=491232, bsz=16369.8, num_updates=18600, lr=0.000463739, gnorm=0.215, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17139 epoch 012: 152 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=555430, ups=1.12, wpb=496196, bsz=16264.2, num_updates=18700, lr=0.000462497, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=17228 epoch 012: 152 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=555430, ups=1.12, wpb=496196, bsz=16264.2, num_updates=18700, lr=0.000462497, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=17228 epoch 012: 152 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=555430, ups=1.12, wpb=496196, bsz=16264.2, num_updates=18700, lr=0.000462497, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=17228 epoch 012: 152 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=555430, ups=1.12, wpb=496196, bsz=16264.2, num_updates=18700, lr=0.000462497, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=17228 epoch 012: 152 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=555430, ups=1.12, wpb=496196, bsz=16264.2, num_updates=18700, lr=0.000462497, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=17228 epoch 012: 152 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=555430, ups=1.12, wpb=496196, bsz=16264.2, num_updates=18700, lr=0.000462497, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=17228 epoch 012: 152 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=555430, ups=1.12, wpb=496196, bsz=16264.2, num_updates=18700, lr=0.000462497, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=17228 epoch 012: 152 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=555430, ups=1.12, wpb=496196, bsz=16264.2, num_updates=18700, lr=0.000462497, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=17228 epoch 012: 152 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=555430, ups=1.12, wpb=496196, bsz=16264.2, num_updates=18700, lr=0.000462497, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=17228 epoch 012: 152 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=555430, ups=1.12, wpb=496196, bsz=16264.2, num_updates=18700, lr=0.000462497, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=17228 epoch 012: 152 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=555430, ups=1.12, wpb=496196, bsz=16264.2, num_updates=18700, lr=0.000462497, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=17228 epoch 012: 152 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=555430, ups=1.12, wpb=496196, bsz=16264.2, num_updates=18700, lr=0.000462497, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=17228 epoch 012: 252 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=548647, ups=1.11, wpb=494870, bsz=16690.6, num_updates=18800, lr=0.000461266, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17318 epoch 012: 252 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=548647, ups=1.11, wpb=494870, bsz=16690.6, num_updates=18800, lr=0.000461266, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17318 epoch 012: 252 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=548647, ups=1.11, wpb=494870, bsz=16690.6, num_updates=18800, lr=0.000461266, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17318 epoch 012: 252 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=548647, ups=1.11, wpb=494870, bsz=16690.6, num_updates=18800, lr=0.000461266, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17318 epoch 012: 252 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=548647, ups=1.11, wpb=494870, bsz=16690.6, num_updates=18800, lr=0.000461266, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17318 epoch 012: 252 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=548647, ups=1.11, wpb=494870, bsz=16690.6, num_updates=18800, lr=0.000461266, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17318 epoch 012: 252 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=548647, ups=1.11, wpb=494870, bsz=16690.6, num_updates=18800, lr=0.000461266, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17318 epoch 012: 252 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=548647, ups=1.11, wpb=494870, bsz=16690.6, num_updates=18800, lr=0.000461266, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17318 epoch 012: 252 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=548647, ups=1.11, wpb=494870, bsz=16690.6, num_updates=18800, lr=0.000461266, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17318 epoch 012: 252 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=548647, ups=1.11, wpb=494870, bsz=16690.6, num_updates=18800, lr=0.000461266, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17318 epoch 012: 252 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=548647, ups=1.11, wpb=494870, bsz=16690.6, num_updates=18800, lr=0.000461266, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17318 epoch 012: 252 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=548647, ups=1.11, wpb=494870, bsz=16690.6, num_updates=18800, lr=0.000461266, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17318 epoch 012: 352 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=549294, ups=1.11, wpb=494345, bsz=16459.3, num_updates=18900, lr=0.000460044, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=17408 epoch 012: 352 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=549294, ups=1.11, wpb=494345, bsz=16459.3, num_updates=18900, lr=0.000460044, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=17408 epoch 012: 352 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=549294, ups=1.11, wpb=494345, bsz=16459.3, num_updates=18900, lr=0.000460044, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=17408 epoch 012: 352 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=549294, ups=1.11, wpb=494345, bsz=16459.3, num_updates=18900, lr=0.000460044, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=17408 epoch 012: 352 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=549294, ups=1.11, wpb=494345, bsz=16459.3, num_updates=18900, lr=0.000460044, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=17408 epoch 012: 352 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=549294, ups=1.11, wpb=494345, bsz=16459.3, num_updates=18900, lr=0.000460044, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=17408 epoch 012: 352 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=549294, ups=1.11, wpb=494345, bsz=16459.3, num_updates=18900, lr=0.000460044, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=17408 epoch 012: 352 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=549294, ups=1.11, wpb=494345, bsz=16459.3, num_updates=18900, lr=0.000460044, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=17408 epoch 012: 352 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=549294, ups=1.11, wpb=494345, bsz=16459.3, num_updates=18900, lr=0.000460044, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=17408 epoch 012: 352 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=549294, ups=1.11, wpb=494345, bsz=16459.3, num_updates=18900, lr=0.000460044, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=17408 epoch 012: 352 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=549294, ups=1.11, wpb=494345, bsz=16459.3, num_updates=18900, lr=0.000460044, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=17408 epoch 012: 352 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=549294, ups=1.11, wpb=494345, bsz=16459.3, num_updates=18900, lr=0.000460044, gnorm=0.225, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=17408 epoch 012: 452 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=548950, ups=1.11, wpb=494288, bsz=16417.5, num_updates=19000, lr=0.000458831, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=17498 epoch 012: 452 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=548950, ups=1.11, wpb=494288, bsz=16417.5, num_updates=19000, lr=0.000458831, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=17498 epoch 012: 452 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=548950, ups=1.11, wpb=494288, bsz=16417.5, num_updates=19000, lr=0.000458831, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=17498 epoch 012: 452 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=548950, ups=1.11, wpb=494288, bsz=16417.5, num_updates=19000, lr=0.000458831, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=17498 epoch 012: 452 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=548950, ups=1.11, wpb=494288, bsz=16417.5, num_updates=19000, lr=0.000458831, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=17498 epoch 012: 452 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=548950, ups=1.11, wpb=494288, bsz=16417.5, num_updates=19000, lr=0.000458831, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=17498 epoch 012: 452 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=548950, ups=1.11, wpb=494288, bsz=16417.5, num_updates=19000, lr=0.000458831, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=17498 epoch 012: 452 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=548950, ups=1.11, wpb=494288, bsz=16417.5, num_updates=19000, lr=0.000458831, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=17498 epoch 012: 452 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=548950, ups=1.11, wpb=494288, bsz=16417.5, num_updates=19000, lr=0.000458831, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=17498 epoch 012: 452 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=548950, ups=1.11, wpb=494288, bsz=16417.5, num_updates=19000, lr=0.000458831, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=17498 epoch 012: 452 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=548950, ups=1.11, wpb=494288, bsz=16417.5, num_updates=19000, lr=0.000458831, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=17498 epoch 012: 452 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=548950, ups=1.11, wpb=494288, bsz=16417.5, num_updates=19000, lr=0.000458831, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=17498 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 3.785 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.784 epoch 012 | valid on 'valid' subset | loss 3.785 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.784 epoch 012 | valid on 'valid' subset | loss 3.785 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.784 epoch 012 | valid on 'valid' subset | loss 3.785 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.784 epoch 012 | valid on 'valid' subset | loss 3.785 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.784 epoch 012 | valid on 'valid' subset | loss 3.785 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.784 epoch 012 | valid on 'valid' subset | loss 3.785 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.784 epoch 012 | valid on 'valid' subset | loss 3.785 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.784 epoch 012 | valid on 'valid' subset | loss 3.785 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.784 epoch 012 | valid on 'valid' subset | loss 3.785 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.784 epoch 012 | valid on 'valid' subset | loss 3.785 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.784 epoch 012 | valid on 'valid' subset | loss 3.785 | nll_loss 2.218 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.784 epoch 012: 552 / 1689 loss=3.752, nll_loss=2.223, ppl=4.67, wps=486109, ups=0.98, wpb=495883, bsz=16460.9, num_updates=19100, lr=0.000457629, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=17601 epoch 012: 552 / 1689 loss=3.752, nll_loss=2.223, ppl=4.67, wps=486109, ups=0.98, wpb=495883, bsz=16460.9, num_updates=19100, lr=0.000457629, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=17601 epoch 012: 552 / 1689 loss=3.752, nll_loss=2.223, ppl=4.67, wps=486109, ups=0.98, wpb=495883, bsz=16460.9, num_updates=19100, lr=0.000457629, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=17601 epoch 012: 552 / 1689 loss=3.752, nll_loss=2.223, ppl=4.67, wps=486109, ups=0.98, wpb=495883, bsz=16460.9, num_updates=19100, lr=0.000457629, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=17601 epoch 012: 552 / 1689 loss=3.752, nll_loss=2.223, ppl=4.67, wps=486109, ups=0.98, wpb=495883, bsz=16460.9, num_updates=19100, lr=0.000457629, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=17601 epoch 012: 552 / 1689 loss=3.752, nll_loss=2.223, ppl=4.67, wps=486109, ups=0.98, wpb=495883, bsz=16460.9, num_updates=19100, lr=0.000457629, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=17601 epoch 012: 552 / 1689 loss=3.752, nll_loss=2.223, ppl=4.67, wps=486109, ups=0.98, wpb=495883, bsz=16460.9, num_updates=19100, lr=0.000457629, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=17601 epoch 012: 552 / 1689 loss=3.752, nll_loss=2.223, ppl=4.67, wps=486109, ups=0.98, wpb=495883, bsz=16460.9, num_updates=19100, lr=0.000457629, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=17601 epoch 012: 552 / 1689 loss=3.752, nll_loss=2.223, ppl=4.67, wps=486109, ups=0.98, wpb=495883, bsz=16460.9, num_updates=19100, lr=0.000457629, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=17601 epoch 012: 552 / 1689 loss=3.752, nll_loss=2.223, ppl=4.67, wps=486109, ups=0.98, wpb=495883, bsz=16460.9, num_updates=19100, lr=0.000457629, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=17601 epoch 012: 552 / 1689 loss=3.752, nll_loss=2.223, ppl=4.67, wps=486109, ups=0.98, wpb=495883, bsz=16460.9, num_updates=19100, lr=0.000457629, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=17601 epoch 012: 552 / 1689 loss=3.752, nll_loss=2.223, ppl=4.67, wps=486109, ups=0.98, wpb=495883, bsz=16460.9, num_updates=19100, lr=0.000457629, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=17601 epoch 012: 653 / 1689 loss=3.749, nll_loss=2.22, ppl=4.66, wps=553561, ups=1.11, wpb=496760, bsz=16695.8, num_updates=19200, lr=0.000456435, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17690 epoch 012: 653 / 1689 loss=3.749, nll_loss=2.22, ppl=4.66, wps=553561, ups=1.11, wpb=496760, bsz=16695.8, num_updates=19200, lr=0.000456435, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17690 epoch 012: 653 / 1689 loss=3.749, nll_loss=2.22, ppl=4.66, wps=553561, ups=1.11, wpb=496760, bsz=16695.8, num_updates=19200, lr=0.000456435, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17690 epoch 012: 653 / 1689 loss=3.749, nll_loss=2.22, ppl=4.66, wps=553561, ups=1.11, wpb=496760, bsz=16695.8, num_updates=19200, lr=0.000456435, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17690 epoch 012: 653 / 1689 loss=3.749, nll_loss=2.22, ppl=4.66, wps=553561, ups=1.11, wpb=496760, bsz=16695.8, num_updates=19200, lr=0.000456435, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17690 epoch 012: 653 / 1689 loss=3.749, nll_loss=2.22, ppl=4.66, wps=553561, ups=1.11, wpb=496760, bsz=16695.8, num_updates=19200, lr=0.000456435, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17690 epoch 012: 653 / 1689 loss=3.749, nll_loss=2.22, ppl=4.66, wps=553561, ups=1.11, wpb=496760, bsz=16695.8, num_updates=19200, lr=0.000456435, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17690 epoch 012: 653 / 1689 loss=3.749, nll_loss=2.22, ppl=4.66, wps=553561, ups=1.11, wpb=496760, bsz=16695.8, num_updates=19200, lr=0.000456435, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17690 epoch 012: 653 / 1689 loss=3.749, nll_loss=2.22, ppl=4.66, wps=553561, ups=1.11, wpb=496760, bsz=16695.8, num_updates=19200, lr=0.000456435, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17690 epoch 012: 653 / 1689 loss=3.749, nll_loss=2.22, ppl=4.66, wps=553561, ups=1.11, wpb=496760, bsz=16695.8, num_updates=19200, lr=0.000456435, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17690 epoch 012: 653 / 1689 loss=3.749, nll_loss=2.22, ppl=4.66, wps=553561, ups=1.11, wpb=496760, bsz=16695.8, num_updates=19200, lr=0.000456435, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17690 epoch 012: 653 / 1689 loss=3.749, nll_loss=2.22, ppl=4.66, wps=553561, ups=1.11, wpb=496760, bsz=16695.8, num_updates=19200, lr=0.000456435, gnorm=0.214, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=17690 epoch 012: 753 / 1689 loss=3.757, nll_loss=2.228, ppl=4.69, wps=556715, ups=1.12, wpb=495899, bsz=16454.3, num_updates=19300, lr=0.000455251, gnorm=0.225, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17779 epoch 012: 753 / 1689 loss=3.757, nll_loss=2.228, ppl=4.69, wps=556715, ups=1.12, wpb=495899, bsz=16454.3, num_updates=19300, lr=0.000455251, gnorm=0.225, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17779 epoch 012: 753 / 1689 loss=3.757, nll_loss=2.228, ppl=4.69, wps=556715, ups=1.12, wpb=495899, bsz=16454.3, num_updates=19300, lr=0.000455251, gnorm=0.225, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17779 epoch 012: 753 / 1689 loss=3.757, nll_loss=2.228, ppl=4.69, wps=556715, ups=1.12, wpb=495899, bsz=16454.3, num_updates=19300, lr=0.000455251, gnorm=0.225, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17779 epoch 012: 753 / 1689 loss=3.757, nll_loss=2.228, ppl=4.69, wps=556715, ups=1.12, wpb=495899, bsz=16454.3, num_updates=19300, lr=0.000455251, gnorm=0.225, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17779 epoch 012: 753 / 1689 loss=3.757, nll_loss=2.228, ppl=4.69, wps=556715, ups=1.12, wpb=495899, bsz=16454.3, num_updates=19300, lr=0.000455251, gnorm=0.225, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17779 epoch 012: 753 / 1689 loss=3.757, nll_loss=2.228, ppl=4.69, wps=556715, ups=1.12, wpb=495899, bsz=16454.3, num_updates=19300, lr=0.000455251, gnorm=0.225, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17779 epoch 012: 753 / 1689 loss=3.757, nll_loss=2.228, ppl=4.69, wps=556715, ups=1.12, wpb=495899, bsz=16454.3, num_updates=19300, lr=0.000455251, gnorm=0.225, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17779 epoch 012: 753 / 1689 loss=3.757, nll_loss=2.228, ppl=4.69, wps=556715, ups=1.12, wpb=495899, bsz=16454.3, num_updates=19300, lr=0.000455251, gnorm=0.225, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17779 epoch 012: 753 / 1689 loss=3.757, nll_loss=2.228, ppl=4.69, wps=556715, ups=1.12, wpb=495899, bsz=16454.3, num_updates=19300, lr=0.000455251, gnorm=0.225, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17779 epoch 012: 753 / 1689 loss=3.757, nll_loss=2.228, ppl=4.69, wps=556715, ups=1.12, wpb=495899, bsz=16454.3, num_updates=19300, lr=0.000455251, gnorm=0.225, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17779 epoch 012: 753 / 1689 loss=3.757, nll_loss=2.228, ppl=4.69, wps=556715, ups=1.12, wpb=495899, bsz=16454.3, num_updates=19300, lr=0.000455251, gnorm=0.225, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17779 epoch 012: 853 / 1689 loss=3.746, nll_loss=2.217, ppl=4.65, wps=551756, ups=1.11, wpb=495923, bsz=16741.9, num_updates=19400, lr=0.000454077, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17869 epoch 012: 853 / 1689 loss=3.746, nll_loss=2.217, ppl=4.65, wps=551756, ups=1.11, wpb=495923, bsz=16741.9, num_updates=19400, lr=0.000454077, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17869 epoch 012: 853 / 1689 loss=3.746, nll_loss=2.217, ppl=4.65, wps=551756, ups=1.11, wpb=495923, bsz=16741.9, num_updates=19400, lr=0.000454077, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17869 epoch 012: 853 / 1689 loss=3.746, nll_loss=2.217, ppl=4.65, wps=551756, ups=1.11, wpb=495923, bsz=16741.9, num_updates=19400, lr=0.000454077, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17869 epoch 012: 853 / 1689 loss=3.746, nll_loss=2.217, ppl=4.65, wps=551756, ups=1.11, wpb=495923, bsz=16741.9, num_updates=19400, lr=0.000454077, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17869 epoch 012: 853 / 1689 loss=3.746, nll_loss=2.217, ppl=4.65, wps=551756, ups=1.11, wpb=495923, bsz=16741.9, num_updates=19400, lr=0.000454077, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17869 epoch 012: 853 / 1689 loss=3.746, nll_loss=2.217, ppl=4.65, wps=551756, ups=1.11, wpb=495923, bsz=16741.9, num_updates=19400, lr=0.000454077, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17869 epoch 012: 853 / 1689 loss=3.746, nll_loss=2.217, ppl=4.65, wps=551756, ups=1.11, wpb=495923, bsz=16741.9, num_updates=19400, lr=0.000454077, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17869 epoch 012: 853 / 1689 loss=3.746, nll_loss=2.217, ppl=4.65, wps=551756, ups=1.11, wpb=495923, bsz=16741.9, num_updates=19400, lr=0.000454077, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17869 epoch 012: 853 / 1689 loss=3.746, nll_loss=2.217, ppl=4.65, wps=551756, ups=1.11, wpb=495923, bsz=16741.9, num_updates=19400, lr=0.000454077, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17869 epoch 012: 853 / 1689 loss=3.746, nll_loss=2.217, ppl=4.65, wps=551756, ups=1.11, wpb=495923, bsz=16741.9, num_updates=19400, lr=0.000454077, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17869 epoch 012: 853 / 1689 loss=3.746, nll_loss=2.217, ppl=4.65, wps=551756, ups=1.11, wpb=495923, bsz=16741.9, num_updates=19400, lr=0.000454077, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=17869 epoch 012: 953 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=550450, ups=1.11, wpb=495202, bsz=16724.2, num_updates=19500, lr=0.000452911, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17959 epoch 012: 953 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=550450, ups=1.11, wpb=495202, bsz=16724.2, num_updates=19500, lr=0.000452911, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17959 epoch 012: 953 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=550450, ups=1.11, wpb=495202, bsz=16724.2, num_updates=19500, lr=0.000452911, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17959 epoch 012: 953 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=550450, ups=1.11, wpb=495202, bsz=16724.2, num_updates=19500, lr=0.000452911, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17959 epoch 012: 953 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=550450, ups=1.11, wpb=495202, bsz=16724.2, num_updates=19500, lr=0.000452911, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17959 epoch 012: 953 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=550450, ups=1.11, wpb=495202, bsz=16724.2, num_updates=19500, lr=0.000452911, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17959 epoch 012: 953 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=550450, ups=1.11, wpb=495202, bsz=16724.2, num_updates=19500, lr=0.000452911, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17959 epoch 012: 953 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=550450, ups=1.11, wpb=495202, bsz=16724.2, num_updates=19500, lr=0.000452911, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17959 epoch 012: 953 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=550450, ups=1.11, wpb=495202, bsz=16724.2, num_updates=19500, lr=0.000452911, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17959 epoch 012: 953 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=550450, ups=1.11, wpb=495202, bsz=16724.2, num_updates=19500, lr=0.000452911, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17959 epoch 012: 953 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=550450, ups=1.11, wpb=495202, bsz=16724.2, num_updates=19500, lr=0.000452911, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17959 epoch 012: 953 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=550450, ups=1.11, wpb=495202, bsz=16724.2, num_updates=19500, lr=0.000452911, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=17959 epoch 012: 1053 / 1689 loss=3.753, nll_loss=2.224, ppl=4.67, wps=557249, ups=1.12, wpb=495498, bsz=16036.2, num_updates=19600, lr=0.000451754, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=19.1, wall=18048 epoch 012: 1053 / 1689 loss=3.753, nll_loss=2.224, ppl=4.67, wps=557249, ups=1.12, wpb=495498, bsz=16036.2, num_updates=19600, lr=0.000451754, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=19.1, wall=18048 epoch 012: 1053 / 1689 loss=3.753, nll_loss=2.224, ppl=4.67, wps=557249, ups=1.12, wpb=495498, bsz=16036.2, num_updates=19600, lr=0.000451754, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=19.1, wall=18048 epoch 012: 1053 / 1689 loss=3.753, nll_loss=2.224, ppl=4.67, wps=557249, ups=1.12, wpb=495498, bsz=16036.2, num_updates=19600, lr=0.000451754, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=19.1, wall=18048 epoch 012: 1053 / 1689 loss=3.753, nll_loss=2.224, ppl=4.67, wps=557249, ups=1.12, wpb=495498, bsz=16036.2, num_updates=19600, lr=0.000451754, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=19.1, wall=18048 epoch 012: 1053 / 1689 loss=3.753, nll_loss=2.224, ppl=4.67, wps=557249, ups=1.12, wpb=495498, bsz=16036.2, num_updates=19600, lr=0.000451754, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=19.1, wall=18048 epoch 012: 1053 / 1689 loss=3.753, nll_loss=2.224, ppl=4.67, wps=557249, ups=1.12, wpb=495498, bsz=16036.2, num_updates=19600, lr=0.000451754, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=19.1, wall=18048 epoch 012: 1053 / 1689 loss=3.753, nll_loss=2.224, ppl=4.67, wps=557249, ups=1.12, wpb=495498, bsz=16036.2, num_updates=19600, lr=0.000451754, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=19.1, wall=18048 epoch 012: 1053 / 1689 loss=3.753, nll_loss=2.224, ppl=4.67, wps=557249, ups=1.12, wpb=495498, bsz=16036.2, num_updates=19600, lr=0.000451754, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=19.1, wall=18048 epoch 012: 1053 / 1689 loss=3.753, nll_loss=2.224, ppl=4.67, wps=557249, ups=1.12, wpb=495498, bsz=16036.2, num_updates=19600, lr=0.000451754, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=19.1, wall=18048 epoch 012: 1053 / 1689 loss=3.753, nll_loss=2.224, ppl=4.67, wps=557249, ups=1.12, wpb=495498, bsz=16036.2, num_updates=19600, lr=0.000451754, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=19.1, wall=18048 epoch 012: 1053 / 1689 loss=3.753, nll_loss=2.224, ppl=4.67, wps=557249, ups=1.12, wpb=495498, bsz=16036.2, num_updates=19600, lr=0.000451754, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=19.1, wall=18048 epoch 012: 1153 / 1689 loss=3.753, nll_loss=2.225, ppl=4.67, wps=552644, ups=1.11, wpb=495890, bsz=16418, num_updates=19700, lr=0.000450606, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=18138 epoch 012: 1153 / 1689 loss=3.753, nll_loss=2.225, ppl=4.67, wps=552644, ups=1.11, wpb=495890, bsz=16418, num_updates=19700, lr=0.000450606, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=18138 epoch 012: 1153 / 1689 loss=3.753, nll_loss=2.225, ppl=4.67, wps=552644, ups=1.11, wpb=495890, bsz=16418, num_updates=19700, lr=0.000450606, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=18138 epoch 012: 1153 / 1689 loss=3.753, nll_loss=2.225, ppl=4.67, wps=552644, ups=1.11, wpb=495890, bsz=16418, num_updates=19700, lr=0.000450606, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=18138 epoch 012: 1153 / 1689 loss=3.753, nll_loss=2.225, ppl=4.67, wps=552644, ups=1.11, wpb=495890, bsz=16418, num_updates=19700, lr=0.000450606, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=18138 epoch 012: 1153 / 1689 loss=3.753, nll_loss=2.225, ppl=4.67, wps=552644, ups=1.11, wpb=495890, bsz=16418, num_updates=19700, lr=0.000450606, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=18138 epoch 012: 1153 / 1689 loss=3.753, nll_loss=2.225, ppl=4.67, wps=552644, ups=1.11, wpb=495890, bsz=16418, num_updates=19700, lr=0.000450606, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=18138 epoch 012: 1153 / 1689 loss=3.753, nll_loss=2.225, ppl=4.67, wps=552644, ups=1.11, wpb=495890, bsz=16418, num_updates=19700, lr=0.000450606, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=18138 epoch 012: 1153 / 1689 loss=3.753, nll_loss=2.225, ppl=4.67, wps=552644, ups=1.11, wpb=495890, bsz=16418, num_updates=19700, lr=0.000450606, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=18138 epoch 012: 1153 / 1689 loss=3.753, nll_loss=2.225, ppl=4.67, wps=552644, ups=1.11, wpb=495890, bsz=16418, num_updates=19700, lr=0.000450606, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=18138 epoch 012: 1153 / 1689 loss=3.753, nll_loss=2.225, ppl=4.67, wps=552644, ups=1.11, wpb=495890, bsz=16418, num_updates=19700, lr=0.000450606, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=18138 epoch 012: 1153 / 1689 loss=3.753, nll_loss=2.225, ppl=4.67, wps=552644, ups=1.11, wpb=495890, bsz=16418, num_updates=19700, lr=0.000450606, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=18138 epoch 012: 1254 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=539146, ups=1.09, wpb=494494, bsz=16752.3, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=18230 epoch 012: 1254 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=539146, ups=1.09, wpb=494494, bsz=16752.3, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=18230 epoch 012: 1254 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=539146, ups=1.09, wpb=494494, bsz=16752.3, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=18230 epoch 012: 1254 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=539146, ups=1.09, wpb=494494, bsz=16752.3, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=18230 epoch 012: 1254 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=539146, ups=1.09, wpb=494494, bsz=16752.3, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=18230 epoch 012: 1254 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=539146, ups=1.09, wpb=494494, bsz=16752.3, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=18230 epoch 012: 1254 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=539146, ups=1.09, wpb=494494, bsz=16752.3, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=18230 epoch 012: 1254 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=539146, ups=1.09, wpb=494494, bsz=16752.3, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=18230 epoch 012: 1254 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=539146, ups=1.09, wpb=494494, bsz=16752.3, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=18230 epoch 012: 1254 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=539146, ups=1.09, wpb=494494, bsz=16752.3, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=18230 epoch 012: 1254 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=539146, ups=1.09, wpb=494494, bsz=16752.3, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=18230 epoch 012: 1254 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=539146, ups=1.09, wpb=494494, bsz=16752.3, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=18230 epoch 012: 1354 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=552747, ups=1.12, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18319 epoch 012: 1354 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=552747, ups=1.12, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18319 epoch 012: 1354 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=552747, ups=1.12, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18319 epoch 012: 1354 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=552747, ups=1.12, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18319 epoch 012: 1354 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=552747, ups=1.12, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18319 epoch 012: 1354 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=552747, ups=1.12, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18319 epoch 012: 1354 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=552747, ups=1.12, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18319 epoch 012: 1354 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=552747, ups=1.12, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18319 epoch 012: 1354 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=552747, ups=1.12, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18319 epoch 012: 1354 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=552747, ups=1.12, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18319 epoch 012: 1354 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=552747, ups=1.12, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18319 epoch 012: 1354 / 1689 loss=3.757, nll_loss=2.229, ppl=4.69, wps=552747, ups=1.12, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=18319 epoch 012: 1454 / 1689 loss=3.759, nll_loss=2.232, ppl=4.7, wps=553350, ups=1.12, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.213, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=18409 epoch 012: 1454 / 1689 loss=3.759, nll_loss=2.232, ppl=4.7, wps=553350, ups=1.12, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.213, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=18409 epoch 012: 1454 / 1689 loss=3.759, nll_loss=2.232, ppl=4.7, wps=553350, ups=1.12, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.213, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=18409 epoch 012: 1454 / 1689 loss=3.759, nll_loss=2.232, ppl=4.7, wps=553350, ups=1.12, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.213, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=18409 epoch 012: 1454 / 1689 loss=3.759, nll_loss=2.232, ppl=4.7, wps=553350, ups=1.12, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.213, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=18409 epoch 012: 1454 / 1689 loss=3.759, nll_loss=2.232, ppl=4.7, wps=553350, ups=1.12, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.213, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=18409 epoch 012: 1454 / 1689 loss=3.759, nll_loss=2.232, ppl=4.7, wps=553350, ups=1.12, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.213, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=18409 epoch 012: 1454 / 1689 loss=3.759, nll_loss=2.232, ppl=4.7, wps=553350, ups=1.12, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.213, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=18409 epoch 012: 1454 / 1689 loss=3.759, nll_loss=2.232, ppl=4.7, wps=553350, ups=1.12, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.213, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=18409 epoch 012: 1454 / 1689 loss=3.759, nll_loss=2.232, ppl=4.7, wps=553350, ups=1.12, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.213, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=18409 epoch 012: 1454 / 1689 loss=3.759, nll_loss=2.232, ppl=4.7, wps=553350, ups=1.12, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.213, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=18409 epoch 012: 1454 / 1689 loss=3.759, nll_loss=2.232, ppl=4.7, wps=553350, ups=1.12, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.213, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=18409 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 3.76 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.76 epoch 012 | valid on 'valid' subset | loss 3.76 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.76 epoch 012 | valid on 'valid' subset | loss 3.76 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.76 epoch 012 | valid on 'valid' subset | loss 3.76 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.76 epoch 012 | valid on 'valid' subset | loss 3.76 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.76 epoch 012 | valid on 'valid' subset | loss 3.76 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.76 epoch 012 | valid on 'valid' subset | loss 3.76 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.76 epoch 012 | valid on 'valid' subset | loss 3.76 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.76 epoch 012 | valid on 'valid' subset | loss 3.76 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.76 epoch 012 | valid on 'valid' subset | loss 3.76 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.76 epoch 012 | valid on 'valid' subset | loss 3.76 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.76 epoch 012 | valid on 'valid' subset | loss 3.76 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.76 epoch 012: 1554 / 1689 loss=3.752, nll_loss=2.224, ppl=4.67, wps=340253, ups=0.69, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18554 epoch 012: 1554 / 1689 loss=3.752, nll_loss=2.224, ppl=4.67, wps=340253, ups=0.69, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18554 epoch 012: 1554 / 1689 loss=3.752, nll_loss=2.224, ppl=4.67, wps=340253, ups=0.69, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18554 epoch 012: 1554 / 1689 loss=3.752, nll_loss=2.224, ppl=4.67, wps=340253, ups=0.69, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18554 epoch 012: 1554 / 1689 loss=3.752, nll_loss=2.224, ppl=4.67, wps=340253, ups=0.69, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18554 epoch 012: 1554 / 1689 loss=3.752, nll_loss=2.224, ppl=4.67, wps=340253, ups=0.69, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18554 epoch 012: 1554 / 1689 loss=3.752, nll_loss=2.224, ppl=4.67, wps=340253, ups=0.69, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18554 epoch 012: 1554 / 1689 loss=3.752, nll_loss=2.224, ppl=4.67, wps=340253, ups=0.69, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18554 epoch 012: 1554 / 1689 loss=3.752, nll_loss=2.224, ppl=4.67, wps=340253, ups=0.69, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18554 epoch 012: 1554 / 1689 loss=3.752, nll_loss=2.224, ppl=4.67, wps=340253, ups=0.69, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18554 epoch 012: 1554 / 1689 loss=3.752, nll_loss=2.224, ppl=4.67, wps=340253, ups=0.69, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18554 epoch 012: 1554 / 1689 loss=3.752, nll_loss=2.224, ppl=4.67, wps=340253, ups=0.69, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18554 epoch 012: 1654 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=564383, ups=1.14, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.216, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=18642 epoch 012: 1654 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=564383, ups=1.14, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.216, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=18642 epoch 012: 1654 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=564383, ups=1.14, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.216, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=18642 epoch 012: 1654 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=564383, ups=1.14, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.216, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=18642 epoch 012: 1654 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=564383, ups=1.14, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.216, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=18642 epoch 012: 1654 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=564383, ups=1.14, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.216, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=18642 epoch 012: 1654 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=564383, ups=1.14, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.216, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=18642 epoch 012: 1654 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=564383, ups=1.14, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.216, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=18642 epoch 012: 1654 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=564383, ups=1.14, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.216, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=18642 epoch 012: 1654 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=564383, ups=1.14, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.216, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=18642 epoch 012: 1654 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=564383, ups=1.14, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.216, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=18642 epoch 012: 1654 / 1689 loss=3.755, nll_loss=2.227, ppl=4.68, wps=564383, ups=1.14, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.216, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=18642 end of epoch 12 (average epoch stats below) epoch 012 | loss 3.753 | nll_loss 2.224 | ppl 4.67 | wps 528219 | ups 1.07 | wpb 495120 | bsz 16505.5 | num_updates 20234 | lr 0.00044462 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1484 | gb_free 23 | wall 18672 epoch 012 | loss 3.753 | nll_loss 2.224 | ppl 4.67 | wps 528219 | ups 1.07 | wpb 495120 | bsz 16505.5 | num_updates 20234 | lr 0.00044462 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1484 | gb_free 23 | wall 18672 epoch 012 | loss 3.753 | nll_loss 2.224 | ppl 4.67 | wps 528219 | ups 1.07 | wpb 495120 | bsz 16505.5 | num_updates 20234 | lr 0.00044462 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1484 | gb_free 23 | wall 18672 epoch 012 | loss 3.753 | nll_loss 2.224 | ppl 4.67 | wps 528219 | ups 1.07 | wpb 495120 | bsz 16505.5 | num_updates 20234 | lr 0.00044462 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1484 | gb_free 23 | wall 18672 epoch 012 | loss 3.753 | nll_loss 2.224 | ppl 4.67 | wps 528219 | ups 1.07 | wpb 495120 | bsz 16505.5 | num_updates 20234 | lr 0.00044462 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1484 | gb_free 23 | wall 18672 epoch 012 | loss 3.753 | nll_loss 2.224 | ppl 4.67 | wps 528219 | ups 1.07 | wpb 495120 | bsz 16505.5 | num_updates 20234 | lr 0.00044462 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1484 | gb_free 23 | wall 18672 epoch 012 | loss 3.753 | nll_loss 2.224 | ppl 4.67 | wps 528219 | ups 1.07 | wpb 495120 | bsz 16505.5 | num_updates 20234 | lr 0.00044462 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1484 | gb_free 23 | wall 18672 epoch 012 | loss 3.753 | nll_loss 2.224 | ppl 4.67 | wps 528219 | ups 1.07 | wpb 495120 | bsz 16505.5 | num_updates 20234 | lr 0.00044462 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1484 | gb_free 23 | wall 18672 epoch 012 | loss 3.753 | nll_loss 2.224 | ppl 4.67 | wps 528219 | ups 1.07 | wpb 495120 | bsz 16505.5 | num_updates 20234 | lr 0.00044462 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1484 | gb_free 23 | wall 18672 epoch 012 | loss 3.753 | nll_loss 2.224 | ppl 4.67 | wps 528219 | ups 1.07 | wpb 495120 | bsz 16505.5 | num_updates 20234 | lr 0.00044462 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1484 | gb_free 23 | wall 18672 epoch 012 | loss 3.753 | nll_loss 2.224 | ppl 4.67 | wps 528219 | ups 1.07 | wpb 495120 | bsz 16505.5 | num_updates 20234 | lr 0.00044462 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1484 | gb_free 23 | wall 18672 epoch 012 | loss 3.753 | nll_loss 2.224 | ppl 4.67 | wps 528219 | ups 1.07 | wpb 495120 | bsz 16505.5 | num_updates 20234 | lr 0.00044462 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1484 | gb_free 23 | wall 18672 Start iterating over samples epoch 013: 66 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=549598, ups=1.12, wpb=491711, bsz=16078.6, num_updates=20300, lr=0.000443897, gnorm=0.222, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18731 epoch 013: 66 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=549598, ups=1.12, wpb=491711, bsz=16078.6, num_updates=20300, lr=0.000443897, gnorm=0.222, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18731 epoch 013: 66 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=549598, ups=1.12, wpb=491711, bsz=16078.6, num_updates=20300, lr=0.000443897, gnorm=0.222, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18731 epoch 013: 66 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=549598, ups=1.12, wpb=491711, bsz=16078.6, num_updates=20300, lr=0.000443897, gnorm=0.222, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18731 epoch 013: 66 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=549598, ups=1.12, wpb=491711, bsz=16078.6, num_updates=20300, lr=0.000443897, gnorm=0.222, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18731 epoch 013: 66 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=549598, ups=1.12, wpb=491711, bsz=16078.6, num_updates=20300, lr=0.000443897, gnorm=0.222, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18731 epoch 013: 66 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=549598, ups=1.12, wpb=491711, bsz=16078.6, num_updates=20300, lr=0.000443897, gnorm=0.222, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18731 epoch 013: 66 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=549598, ups=1.12, wpb=491711, bsz=16078.6, num_updates=20300, lr=0.000443897, gnorm=0.222, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18731 epoch 013: 66 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=549598, ups=1.12, wpb=491711, bsz=16078.6, num_updates=20300, lr=0.000443897, gnorm=0.222, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18731 epoch 013: 66 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=549598, ups=1.12, wpb=491711, bsz=16078.6, num_updates=20300, lr=0.000443897, gnorm=0.222, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18731 epoch 013: 66 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=549598, ups=1.12, wpb=491711, bsz=16078.6, num_updates=20300, lr=0.000443897, gnorm=0.222, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18731 epoch 013: 66 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=549598, ups=1.12, wpb=491711, bsz=16078.6, num_updates=20300, lr=0.000443897, gnorm=0.222, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18731 epoch 013: 66 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=549598, ups=1.12, wpb=491711, bsz=16078.6, num_updates=20300, lr=0.000443897, gnorm=0.222, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=18731 epoch 013: 167 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=559003, ups=1.13, wpb=495143, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.218, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=18820 epoch 013: 167 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=559003, ups=1.13, wpb=495143, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.218, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=18820 epoch 013: 167 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=559003, ups=1.13, wpb=495143, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.218, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=18820 epoch 013: 167 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=559003, ups=1.13, wpb=495143, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.218, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=18820 epoch 013: 167 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=559003, ups=1.13, wpb=495143, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.218, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=18820 epoch 013: 167 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=559003, ups=1.13, wpb=495143, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.218, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=18820 epoch 013: 167 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=559003, ups=1.13, wpb=495143, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.218, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=18820 epoch 013: 167 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=559003, ups=1.13, wpb=495143, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.218, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=18820 epoch 013: 167 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=559003, ups=1.13, wpb=495143, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.218, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=18820 epoch 013: 167 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=559003, ups=1.13, wpb=495143, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.218, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=18820 epoch 013: 167 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=559003, ups=1.13, wpb=495143, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.218, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=18820 epoch 013: 167 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=559003, ups=1.13, wpb=495143, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.218, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=18820 epoch 013: 167 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=559003, ups=1.13, wpb=495143, bsz=16387.8, num_updates=20400, lr=0.000442807, gnorm=0.218, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=18820 epoch 013: 267 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=559988, ups=1.13, wpb=495098, bsz=16481.4, num_updates=20500, lr=0.000441726, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18908 epoch 013: 267 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=559988, ups=1.13, wpb=495098, bsz=16481.4, num_updates=20500, lr=0.000441726, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18908 epoch 013: 267 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=559988, ups=1.13, wpb=495098, bsz=16481.4, num_updates=20500, lr=0.000441726, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18908 epoch 013: 267 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=559988, ups=1.13, wpb=495098, bsz=16481.4, num_updates=20500, lr=0.000441726, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18908 epoch 013: 267 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=559988, ups=1.13, wpb=495098, bsz=16481.4, num_updates=20500, lr=0.000441726, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18908 epoch 013: 267 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=559988, ups=1.13, wpb=495098, bsz=16481.4, num_updates=20500, lr=0.000441726, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18908 epoch 013: 267 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=559988, ups=1.13, wpb=495098, bsz=16481.4, num_updates=20500, lr=0.000441726, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18908 epoch 013: 267 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=559988, ups=1.13, wpb=495098, bsz=16481.4, num_updates=20500, lr=0.000441726, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18908 epoch 013: 267 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=559988, ups=1.13, wpb=495098, bsz=16481.4, num_updates=20500, lr=0.000441726, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18908 epoch 013: 267 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=559988, ups=1.13, wpb=495098, bsz=16481.4, num_updates=20500, lr=0.000441726, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18908 epoch 013: 267 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=559988, ups=1.13, wpb=495098, bsz=16481.4, num_updates=20500, lr=0.000441726, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18908 epoch 013: 267 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=559988, ups=1.13, wpb=495098, bsz=16481.4, num_updates=20500, lr=0.000441726, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18908 epoch 013: 267 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=559988, ups=1.13, wpb=495098, bsz=16481.4, num_updates=20500, lr=0.000441726, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18908 epoch 013: 367 / 1689 loss=3.737, nll_loss=2.207, ppl=4.62, wps=556412, ups=1.13, wpb=494360, bsz=16847, num_updates=20600, lr=0.000440653, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18997 epoch 013: 367 / 1689 loss=3.737, nll_loss=2.207, ppl=4.62, wps=556412, ups=1.13, wpb=494360, bsz=16847, num_updates=20600, lr=0.000440653, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18997 epoch 013: 367 / 1689 loss=3.737, nll_loss=2.207, ppl=4.62, wps=556412, ups=1.13, wpb=494360, bsz=16847, num_updates=20600, lr=0.000440653, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18997 epoch 013: 367 / 1689 loss=3.737, nll_loss=2.207, ppl=4.62, wps=556412, ups=1.13, wpb=494360, bsz=16847, num_updates=20600, lr=0.000440653, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18997 epoch 013: 367 / 1689 loss=3.737, nll_loss=2.207, ppl=4.62, wps=556412, ups=1.13, wpb=494360, bsz=16847, num_updates=20600, lr=0.000440653, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18997 epoch 013: 367 / 1689 loss=3.737, nll_loss=2.207, ppl=4.62, wps=556412, ups=1.13, wpb=494360, bsz=16847, num_updates=20600, lr=0.000440653, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18997 epoch 013: 367 / 1689 loss=3.737, nll_loss=2.207, ppl=4.62, wps=556412, ups=1.13, wpb=494360, bsz=16847, num_updates=20600, lr=0.000440653, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18997 epoch 013: 367 / 1689 loss=3.737, nll_loss=2.207, ppl=4.62, wps=556412, ups=1.13, wpb=494360, bsz=16847, num_updates=20600, lr=0.000440653, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18997 epoch 013: 367 / 1689 loss=3.737, nll_loss=2.207, ppl=4.62, wps=556412, ups=1.13, wpb=494360, bsz=16847, num_updates=20600, lr=0.000440653, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18997 epoch 013: 367 / 1689 loss=3.737, nll_loss=2.207, ppl=4.62, wps=556412, ups=1.13, wpb=494360, bsz=16847, num_updates=20600, lr=0.000440653, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18997 epoch 013: 367 / 1689 loss=3.737, nll_loss=2.207, ppl=4.62, wps=556412, ups=1.13, wpb=494360, bsz=16847, num_updates=20600, lr=0.000440653, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18997 epoch 013: 367 / 1689 loss=3.737, nll_loss=2.207, ppl=4.62, wps=556412, ups=1.13, wpb=494360, bsz=16847, num_updates=20600, lr=0.000440653, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18997 epoch 013: 367 / 1689 loss=3.737, nll_loss=2.207, ppl=4.62, wps=556412, ups=1.13, wpb=494360, bsz=16847, num_updates=20600, lr=0.000440653, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=18997 epoch 013: 467 / 1689 loss=3.744, nll_loss=2.214, ppl=4.64, wps=557163, ups=1.13, wpb=494100, bsz=16485.4, num_updates=20700, lr=0.000439587, gnorm=0.209, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=19086 epoch 013: 467 / 1689 loss=3.744, nll_loss=2.214, ppl=4.64, wps=557163, ups=1.13, wpb=494100, bsz=16485.4, num_updates=20700, lr=0.000439587, gnorm=0.209, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=19086 epoch 013: 467 / 1689 loss=3.744, nll_loss=2.214, ppl=4.64, wps=557163, ups=1.13, wpb=494100, bsz=16485.4, num_updates=20700, lr=0.000439587, gnorm=0.209, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=19086 epoch 013: 467 / 1689 loss=3.744, nll_loss=2.214, ppl=4.64, wps=557163, ups=1.13, wpb=494100, bsz=16485.4, num_updates=20700, lr=0.000439587, gnorm=0.209, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=19086 epoch 013: 467 / 1689 loss=3.744, nll_loss=2.214, ppl=4.64, wps=557163, ups=1.13, wpb=494100, bsz=16485.4, num_updates=20700, lr=0.000439587, gnorm=0.209, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=19086 epoch 013: 467 / 1689 loss=3.744, nll_loss=2.214, ppl=4.64, wps=557163, ups=1.13, wpb=494100, bsz=16485.4, num_updates=20700, lr=0.000439587, gnorm=0.209, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=19086 epoch 013: 467 / 1689 loss=3.744, nll_loss=2.214, ppl=4.64, wps=557163, ups=1.13, wpb=494100, bsz=16485.4, num_updates=20700, lr=0.000439587, gnorm=0.209, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=19086 epoch 013: 467 / 1689 loss=3.744, nll_loss=2.214, ppl=4.64, wps=557163, ups=1.13, wpb=494100, bsz=16485.4, num_updates=20700, lr=0.000439587, gnorm=0.209, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=19086 epoch 013: 467 / 1689 loss=3.744, nll_loss=2.214, ppl=4.64, wps=557163, ups=1.13, wpb=494100, bsz=16485.4, num_updates=20700, lr=0.000439587, gnorm=0.209, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=19086 epoch 013: 467 / 1689 loss=3.744, nll_loss=2.214, ppl=4.64, wps=557163, ups=1.13, wpb=494100, bsz=16485.4, num_updates=20700, lr=0.000439587, gnorm=0.209, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=19086 epoch 013: 467 / 1689 loss=3.744, nll_loss=2.214, ppl=4.64, wps=557163, ups=1.13, wpb=494100, bsz=16485.4, num_updates=20700, lr=0.000439587, gnorm=0.209, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=19086 epoch 013: 467 / 1689 loss=3.744, nll_loss=2.214, ppl=4.64, wps=557163, ups=1.13, wpb=494100, bsz=16485.4, num_updates=20700, lr=0.000439587, gnorm=0.209, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=19086 epoch 013: 467 / 1689 loss=3.744, nll_loss=2.214, ppl=4.64, wps=557163, ups=1.13, wpb=494100, bsz=16485.4, num_updates=20700, lr=0.000439587, gnorm=0.209, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=19086 epoch 013: 567 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=558672, ups=1.13, wpb=495618, bsz=16063.3, num_updates=20800, lr=0.000438529, gnorm=0.205, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=19175 epoch 013: 567 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=558672, ups=1.13, wpb=495618, bsz=16063.3, num_updates=20800, lr=0.000438529, gnorm=0.205, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=19175 epoch 013: 567 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=558672, ups=1.13, wpb=495618, bsz=16063.3, num_updates=20800, lr=0.000438529, gnorm=0.205, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=19175 epoch 013: 567 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=558672, ups=1.13, wpb=495618, bsz=16063.3, num_updates=20800, lr=0.000438529, gnorm=0.205, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=19175 epoch 013: 567 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=558672, ups=1.13, wpb=495618, bsz=16063.3, num_updates=20800, lr=0.000438529, gnorm=0.205, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=19175 epoch 013: 567 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=558672, ups=1.13, wpb=495618, bsz=16063.3, num_updates=20800, lr=0.000438529, gnorm=0.205, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=19175 epoch 013: 567 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=558672, ups=1.13, wpb=495618, bsz=16063.3, num_updates=20800, lr=0.000438529, gnorm=0.205, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=19175 epoch 013: 567 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=558672, ups=1.13, wpb=495618, bsz=16063.3, num_updates=20800, lr=0.000438529, gnorm=0.205, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=19175 epoch 013: 567 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=558672, ups=1.13, wpb=495618, bsz=16063.3, num_updates=20800, lr=0.000438529, gnorm=0.205, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=19175 epoch 013: 567 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=558672, ups=1.13, wpb=495618, bsz=16063.3, num_updates=20800, lr=0.000438529, gnorm=0.205, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=19175 epoch 013: 567 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=558672, ups=1.13, wpb=495618, bsz=16063.3, num_updates=20800, lr=0.000438529, gnorm=0.205, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=19175 epoch 013: 567 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=558672, ups=1.13, wpb=495618, bsz=16063.3, num_updates=20800, lr=0.000438529, gnorm=0.205, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=19175 epoch 013: 567 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=558672, ups=1.13, wpb=495618, bsz=16063.3, num_updates=20800, lr=0.000438529, gnorm=0.205, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=19175 epoch 013: 667 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554233, ups=1.12, wpb=496305, bsz=17039.6, num_updates=20900, lr=0.000437479, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=19264 epoch 013: 667 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554233, ups=1.12, wpb=496305, bsz=17039.6, num_updates=20900, lr=0.000437479, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=19264 epoch 013: 667 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554233, ups=1.12, wpb=496305, bsz=17039.6, num_updates=20900, lr=0.000437479, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=19264 epoch 013: 667 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554233, ups=1.12, wpb=496305, bsz=17039.6, num_updates=20900, lr=0.000437479, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=19264 epoch 013: 667 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554233, ups=1.12, wpb=496305, bsz=17039.6, num_updates=20900, lr=0.000437479, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=19264 epoch 013: 667 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554233, ups=1.12, wpb=496305, bsz=17039.6, num_updates=20900, lr=0.000437479, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=19264 epoch 013: 667 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554233, ups=1.12, wpb=496305, bsz=17039.6, num_updates=20900, lr=0.000437479, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=19264 epoch 013: 667 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554233, ups=1.12, wpb=496305, bsz=17039.6, num_updates=20900, lr=0.000437479, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=19264 epoch 013: 667 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554233, ups=1.12, wpb=496305, bsz=17039.6, num_updates=20900, lr=0.000437479, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=19264 epoch 013: 667 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554233, ups=1.12, wpb=496305, bsz=17039.6, num_updates=20900, lr=0.000437479, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=19264 epoch 013: 667 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554233, ups=1.12, wpb=496305, bsz=17039.6, num_updates=20900, lr=0.000437479, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=19264 epoch 013: 667 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554233, ups=1.12, wpb=496305, bsz=17039.6, num_updates=20900, lr=0.000437479, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=19264 epoch 013: 667 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554233, ups=1.12, wpb=496305, bsz=17039.6, num_updates=20900, lr=0.000437479, gnorm=0.212, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=19264 epoch 013: 767 / 1689 loss=3.745, nll_loss=2.215, ppl=4.64, wps=556509, ups=1.12, wpb=494803, bsz=16507, num_updates=21000, lr=0.000436436, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=19353 epoch 013: 767 / 1689 loss=3.745, nll_loss=2.215, ppl=4.64, wps=556509, ups=1.12, wpb=494803, bsz=16507, num_updates=21000, lr=0.000436436, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=19353 epoch 013: 767 / 1689 loss=3.745, nll_loss=2.215, ppl=4.64, wps=556509, ups=1.12, wpb=494803, bsz=16507, num_updates=21000, lr=0.000436436, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=19353 epoch 013: 767 / 1689 loss=3.745, nll_loss=2.215, ppl=4.64, wps=556509, ups=1.12, wpb=494803, bsz=16507, num_updates=21000, lr=0.000436436, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=19353 epoch 013: 767 / 1689 loss=3.745, nll_loss=2.215, ppl=4.64, wps=556509, ups=1.12, wpb=494803, bsz=16507, num_updates=21000, lr=0.000436436, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=19353 epoch 013: 767 / 1689 loss=3.745, nll_loss=2.215, ppl=4.64, wps=556509, ups=1.12, wpb=494803, bsz=16507, num_updates=21000, lr=0.000436436, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=19353 epoch 013: 767 / 1689 loss=3.745, nll_loss=2.215, ppl=4.64, wps=556509, ups=1.12, wpb=494803, bsz=16507, num_updates=21000, lr=0.000436436, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=19353 epoch 013: 767 / 1689 loss=3.745, nll_loss=2.215, ppl=4.64, wps=556509, ups=1.12, wpb=494803, bsz=16507, num_updates=21000, lr=0.000436436, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=19353 epoch 013: 767 / 1689 loss=3.745, nll_loss=2.215, ppl=4.64, wps=556509, ups=1.12, wpb=494803, bsz=16507, num_updates=21000, lr=0.000436436, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=19353 epoch 013: 767 / 1689 loss=3.745, nll_loss=2.215, ppl=4.64, wps=556509, ups=1.12, wpb=494803, bsz=16507, num_updates=21000, lr=0.000436436, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=19353 epoch 013: 767 / 1689 loss=3.745, nll_loss=2.215, ppl=4.64, wps=556509, ups=1.12, wpb=494803, bsz=16507, num_updates=21000, lr=0.000436436, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=19353 epoch 013: 767 / 1689 loss=3.745, nll_loss=2.215, ppl=4.64, wps=556509, ups=1.12, wpb=494803, bsz=16507, num_updates=21000, lr=0.000436436, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=19353 epoch 013: 767 / 1689 loss=3.745, nll_loss=2.215, ppl=4.64, wps=556509, ups=1.12, wpb=494803, bsz=16507, num_updates=21000, lr=0.000436436, gnorm=0.217, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=19353 begin validation on "valid" subset epoch 013 | valid on 'valid' subset | loss 3.769 | nll_loss 2.2 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.76 epoch 013 | valid on 'valid' subset | loss 3.769 | nll_loss 2.2 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.76 epoch 013 | valid on 'valid' subset | loss 3.769 | nll_loss 2.2 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.76 epoch 013 | valid on 'valid' subset | loss 3.769 | nll_loss 2.2 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.76 epoch 013 | valid on 'valid' subset | loss 3.769 | nll_loss 2.2 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.76 epoch 013 | valid on 'valid' subset | loss 3.769 | nll_loss 2.2 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.76 epoch 013 | valid on 'valid' subset | loss 3.769 | nll_loss 2.2 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.76 epoch 013 | valid on 'valid' subset | loss 3.769 | nll_loss 2.2 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.76 epoch 013 | valid on 'valid' subset | loss 3.769 | nll_loss 2.2 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.76 epoch 013 | valid on 'valid' subset | loss 3.769 | nll_loss 2.2 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.76 epoch 013 | valid on 'valid' subset | loss 3.769 | nll_loss 2.2 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.76 epoch 013 | valid on 'valid' subset | loss 3.769 | nll_loss 2.2 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.76 epoch 013 | valid on 'valid' subset | loss 3.769 | nll_loss 2.2 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.76 epoch 013: 868 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=481224, ups=0.97, wpb=495690, bsz=16751.4, num_updates=21100, lr=0.0004354, gnorm=0.218, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=19456 epoch 013: 868 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=481224, ups=0.97, wpb=495690, bsz=16751.4, num_updates=21100, lr=0.0004354, gnorm=0.218, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=19456 epoch 013: 868 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=481224, ups=0.97, wpb=495690, bsz=16751.4, num_updates=21100, lr=0.0004354, gnorm=0.218, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=19456 epoch 013: 868 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=481224, ups=0.97, wpb=495690, bsz=16751.4, num_updates=21100, lr=0.0004354, gnorm=0.218, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=19456 epoch 013: 868 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=481224, ups=0.97, wpb=495690, bsz=16751.4, num_updates=21100, lr=0.0004354, gnorm=0.218, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=19456 epoch 013: 868 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=481224, ups=0.97, wpb=495690, bsz=16751.4, num_updates=21100, lr=0.0004354, gnorm=0.218, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=19456 epoch 013: 868 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=481224, ups=0.97, wpb=495690, bsz=16751.4, num_updates=21100, lr=0.0004354, gnorm=0.218, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=19456 epoch 013: 868 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=481224, ups=0.97, wpb=495690, bsz=16751.4, num_updates=21100, lr=0.0004354, gnorm=0.218, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=19456 epoch 013: 868 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=481224, ups=0.97, wpb=495690, bsz=16751.4, num_updates=21100, lr=0.0004354, gnorm=0.218, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=19456 epoch 013: 868 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=481224, ups=0.97, wpb=495690, bsz=16751.4, num_updates=21100, lr=0.0004354, gnorm=0.218, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=19456 epoch 013: 868 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=481224, ups=0.97, wpb=495690, bsz=16751.4, num_updates=21100, lr=0.0004354, gnorm=0.218, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=19456 epoch 013: 868 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=481224, ups=0.97, wpb=495690, bsz=16751.4, num_updates=21100, lr=0.0004354, gnorm=0.218, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=19456 epoch 013: 868 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=481224, ups=0.97, wpb=495690, bsz=16751.4, num_updates=21100, lr=0.0004354, gnorm=0.218, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=19456 epoch 013: 968 / 1689 loss=3.735, nll_loss=2.205, ppl=4.61, wps=557394, ups=1.12, wpb=496294, bsz=16492.2, num_updates=21200, lr=0.000434372, gnorm=0.21, clip=0, loss_scale=1, train_wall=88, gb_free=22.7, wall=19545 epoch 013: 968 / 1689 loss=3.735, nll_loss=2.205, ppl=4.61, wps=557394, ups=1.12, wpb=496294, bsz=16492.2, num_updates=21200, lr=0.000434372, gnorm=0.21, clip=0, loss_scale=1, train_wall=88, gb_free=22.7, wall=19545 epoch 013: 968 / 1689 loss=3.735, nll_loss=2.205, ppl=4.61, wps=557394, ups=1.12, wpb=496294, bsz=16492.2, num_updates=21200, lr=0.000434372, gnorm=0.21, clip=0, loss_scale=1, train_wall=88, gb_free=22.7, wall=19545 epoch 013: 968 / 1689 loss=3.735, nll_loss=2.205, ppl=4.61, wps=557394, ups=1.12, wpb=496294, bsz=16492.2, num_updates=21200, lr=0.000434372, gnorm=0.21, clip=0, loss_scale=1, train_wall=88, gb_free=22.7, wall=19545 epoch 013: 968 / 1689 loss=3.735, nll_loss=2.205, ppl=4.61, wps=557394, ups=1.12, wpb=496294, bsz=16492.2, num_updates=21200, lr=0.000434372, gnorm=0.21, clip=0, loss_scale=1, train_wall=88, gb_free=22.7, wall=19545 epoch 013: 968 / 1689 loss=3.735, nll_loss=2.205, ppl=4.61, wps=557394, ups=1.12, wpb=496294, bsz=16492.2, num_updates=21200, lr=0.000434372, gnorm=0.21, clip=0, loss_scale=1, train_wall=88, gb_free=22.7, wall=19545 epoch 013: 968 / 1689 loss=3.735, nll_loss=2.205, ppl=4.61, wps=557394, ups=1.12, wpb=496294, bsz=16492.2, num_updates=21200, lr=0.000434372, gnorm=0.21, clip=0, loss_scale=1, train_wall=88, gb_free=22.7, wall=19545 epoch 013: 968 / 1689 loss=3.735, nll_loss=2.205, ppl=4.61, wps=557394, ups=1.12, wpb=496294, bsz=16492.2, num_updates=21200, lr=0.000434372, gnorm=0.21, clip=0, loss_scale=1, train_wall=88, gb_free=22.7, wall=19545 epoch 013: 968 / 1689 loss=3.735, nll_loss=2.205, ppl=4.61, wps=557394, ups=1.12, wpb=496294, bsz=16492.2, num_updates=21200, lr=0.000434372, gnorm=0.21, clip=0, loss_scale=1, train_wall=88, gb_free=22.7, wall=19545 epoch 013: 968 / 1689 loss=3.735, nll_loss=2.205, ppl=4.61, wps=557394, ups=1.12, wpb=496294, bsz=16492.2, num_updates=21200, lr=0.000434372, gnorm=0.21, clip=0, loss_scale=1, train_wall=88, gb_free=22.7, wall=19545 epoch 013: 968 / 1689 loss=3.735, nll_loss=2.205, ppl=4.61, wps=557394, ups=1.12, wpb=496294, bsz=16492.2, num_updates=21200, lr=0.000434372, gnorm=0.21, clip=0, loss_scale=1, train_wall=88, gb_free=22.7, wall=19545 epoch 013: 968 / 1689 loss=3.735, nll_loss=2.205, ppl=4.61, wps=557394, ups=1.12, wpb=496294, bsz=16492.2, num_updates=21200, lr=0.000434372, gnorm=0.21, clip=0, loss_scale=1, train_wall=88, gb_free=22.7, wall=19545 epoch 013: 968 / 1689 loss=3.735, nll_loss=2.205, ppl=4.61, wps=557394, ups=1.12, wpb=496294, bsz=16492.2, num_updates=21200, lr=0.000434372, gnorm=0.21, clip=0, loss_scale=1, train_wall=88, gb_free=22.7, wall=19545 epoch 013: 1068 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=560383, ups=1.13, wpb=495287, bsz=16471.4, num_updates=21300, lr=0.000433351, gnorm=0.225, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=19634 epoch 013: 1068 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=560383, ups=1.13, wpb=495287, bsz=16471.4, num_updates=21300, lr=0.000433351, gnorm=0.225, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=19634 epoch 013: 1068 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=560383, ups=1.13, wpb=495287, bsz=16471.4, num_updates=21300, lr=0.000433351, gnorm=0.225, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=19634 epoch 013: 1068 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=560383, ups=1.13, wpb=495287, bsz=16471.4, num_updates=21300, lr=0.000433351, gnorm=0.225, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=19634 epoch 013: 1068 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=560383, ups=1.13, wpb=495287, bsz=16471.4, num_updates=21300, lr=0.000433351, gnorm=0.225, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=19634 epoch 013: 1068 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=560383, ups=1.13, wpb=495287, bsz=16471.4, num_updates=21300, lr=0.000433351, gnorm=0.225, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=19634 epoch 013: 1068 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=560383, ups=1.13, wpb=495287, bsz=16471.4, num_updates=21300, lr=0.000433351, gnorm=0.225, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=19634 epoch 013: 1068 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=560383, ups=1.13, wpb=495287, bsz=16471.4, num_updates=21300, lr=0.000433351, gnorm=0.225, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=19634 epoch 013: 1068 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=560383, ups=1.13, wpb=495287, bsz=16471.4, num_updates=21300, lr=0.000433351, gnorm=0.225, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=19634 epoch 013: 1068 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=560383, ups=1.13, wpb=495287, bsz=16471.4, num_updates=21300, lr=0.000433351, gnorm=0.225, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=19634 epoch 013: 1068 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=560383, ups=1.13, wpb=495287, bsz=16471.4, num_updates=21300, lr=0.000433351, gnorm=0.225, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=19634 epoch 013: 1068 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=560383, ups=1.13, wpb=495287, bsz=16471.4, num_updates=21300, lr=0.000433351, gnorm=0.225, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=19634 epoch 013: 1068 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=560383, ups=1.13, wpb=495287, bsz=16471.4, num_updates=21300, lr=0.000433351, gnorm=0.225, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=19634 epoch 013: 1168 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=553673, ups=1.12, wpb=494232, bsz=16459.7, num_updates=21400, lr=0.000432338, gnorm=0.215, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=19723 epoch 013: 1168 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=553673, ups=1.12, wpb=494232, bsz=16459.7, num_updates=21400, lr=0.000432338, gnorm=0.215, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=19723 epoch 013: 1168 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=553673, ups=1.12, wpb=494232, bsz=16459.7, num_updates=21400, lr=0.000432338, gnorm=0.215, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=19723 epoch 013: 1168 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=553673, ups=1.12, wpb=494232, bsz=16459.7, num_updates=21400, lr=0.000432338, gnorm=0.215, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=19723 epoch 013: 1168 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=553673, ups=1.12, wpb=494232, bsz=16459.7, num_updates=21400, lr=0.000432338, gnorm=0.215, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=19723 epoch 013: 1168 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=553673, ups=1.12, wpb=494232, bsz=16459.7, num_updates=21400, lr=0.000432338, gnorm=0.215, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=19723 epoch 013: 1168 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=553673, ups=1.12, wpb=494232, bsz=16459.7, num_updates=21400, lr=0.000432338, gnorm=0.215, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=19723 epoch 013: 1168 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=553673, ups=1.12, wpb=494232, bsz=16459.7, num_updates=21400, lr=0.000432338, gnorm=0.215, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=19723 epoch 013: 1168 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=553673, ups=1.12, wpb=494232, bsz=16459.7, num_updates=21400, lr=0.000432338, gnorm=0.215, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=19723 epoch 013: 1168 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=553673, ups=1.12, wpb=494232, bsz=16459.7, num_updates=21400, lr=0.000432338, gnorm=0.215, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=19723 epoch 013: 1168 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=553673, ups=1.12, wpb=494232, bsz=16459.7, num_updates=21400, lr=0.000432338, gnorm=0.215, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=19723 epoch 013: 1168 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=553673, ups=1.12, wpb=494232, bsz=16459.7, num_updates=21400, lr=0.000432338, gnorm=0.215, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=19723 epoch 013: 1168 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=553673, ups=1.12, wpb=494232, bsz=16459.7, num_updates=21400, lr=0.000432338, gnorm=0.215, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=19723 epoch 013: 1268 / 1689 loss=3.745, nll_loss=2.217, ppl=4.65, wps=556087, ups=1.12, wpb=495556, bsz=16234, num_updates=21500, lr=0.000431331, gnorm=0.215, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=19812 epoch 013: 1268 / 1689 loss=3.745, nll_loss=2.217, ppl=4.65, wps=556087, ups=1.12, wpb=495556, bsz=16234, num_updates=21500, lr=0.000431331, gnorm=0.215, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=19812 epoch 013: 1268 / 1689 loss=3.745, nll_loss=2.217, ppl=4.65, wps=556087, ups=1.12, wpb=495556, bsz=16234, num_updates=21500, lr=0.000431331, gnorm=0.215, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=19812 epoch 013: 1268 / 1689 loss=3.745, nll_loss=2.217, ppl=4.65, wps=556087, ups=1.12, wpb=495556, bsz=16234, num_updates=21500, lr=0.000431331, gnorm=0.215, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=19812 epoch 013: 1268 / 1689 loss=3.745, nll_loss=2.217, ppl=4.65, wps=556087, ups=1.12, wpb=495556, bsz=16234, num_updates=21500, lr=0.000431331, gnorm=0.215, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=19812 epoch 013: 1268 / 1689 loss=3.745, nll_loss=2.217, ppl=4.65, wps=556087, ups=1.12, wpb=495556, bsz=16234, num_updates=21500, lr=0.000431331, gnorm=0.215, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=19812 epoch 013: 1268 / 1689 loss=3.745, nll_loss=2.217, ppl=4.65, wps=556087, ups=1.12, wpb=495556, bsz=16234, num_updates=21500, lr=0.000431331, gnorm=0.215, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=19812 epoch 013: 1268 / 1689 loss=3.745, nll_loss=2.217, ppl=4.65, wps=556087, ups=1.12, wpb=495556, bsz=16234, num_updates=21500, lr=0.000431331, gnorm=0.215, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=19812 epoch 013: 1268 / 1689 loss=3.745, nll_loss=2.217, ppl=4.65, wps=556087, ups=1.12, wpb=495556, bsz=16234, num_updates=21500, lr=0.000431331, gnorm=0.215, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=19812 epoch 013: 1268 / 1689 loss=3.745, nll_loss=2.217, ppl=4.65, wps=556087, ups=1.12, wpb=495556, bsz=16234, num_updates=21500, lr=0.000431331, gnorm=0.215, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=19812 epoch 013: 1268 / 1689 loss=3.745, nll_loss=2.217, ppl=4.65, wps=556087, ups=1.12, wpb=495556, bsz=16234, num_updates=21500, lr=0.000431331, gnorm=0.215, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=19812 epoch 013: 1268 / 1689 loss=3.745, nll_loss=2.217, ppl=4.65, wps=556087, ups=1.12, wpb=495556, bsz=16234, num_updates=21500, lr=0.000431331, gnorm=0.215, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=19812 epoch 013: 1268 / 1689 loss=3.745, nll_loss=2.217, ppl=4.65, wps=556087, ups=1.12, wpb=495556, bsz=16234, num_updates=21500, lr=0.000431331, gnorm=0.215, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=19812 epoch 013: 1368 / 1689 loss=3.74, nll_loss=2.211, ppl=4.63, wps=551854, ups=1.11, wpb=494992, bsz=16767.5, num_updates=21600, lr=0.000430331, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=19902 epoch 013: 1368 / 1689 loss=3.74, nll_loss=2.211, ppl=4.63, wps=551854, ups=1.11, wpb=494992, bsz=16767.5, num_updates=21600, lr=0.000430331, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=19902 epoch 013: 1368 / 1689 loss=3.74, nll_loss=2.211, ppl=4.63, wps=551854, ups=1.11, wpb=494992, bsz=16767.5, num_updates=21600, lr=0.000430331, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=19902 epoch 013: 1368 / 1689 loss=3.74, nll_loss=2.211, ppl=4.63, wps=551854, ups=1.11, wpb=494992, bsz=16767.5, num_updates=21600, lr=0.000430331, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=19902 epoch 013: 1368 / 1689 loss=3.74, nll_loss=2.211, ppl=4.63, wps=551854, ups=1.11, wpb=494992, bsz=16767.5, num_updates=21600, lr=0.000430331, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=19902 epoch 013: 1368 / 1689 loss=3.74, nll_loss=2.211, ppl=4.63, wps=551854, ups=1.11, wpb=494992, bsz=16767.5, num_updates=21600, lr=0.000430331, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=19902 epoch 013: 1368 / 1689 loss=3.74, nll_loss=2.211, ppl=4.63, wps=551854, ups=1.11, wpb=494992, bsz=16767.5, num_updates=21600, lr=0.000430331, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=19902 epoch 013: 1368 / 1689 loss=3.74, nll_loss=2.211, ppl=4.63, wps=551854, ups=1.11, wpb=494992, bsz=16767.5, num_updates=21600, lr=0.000430331, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=19902 epoch 013: 1368 / 1689 loss=3.74, nll_loss=2.211, ppl=4.63, wps=551854, ups=1.11, wpb=494992, bsz=16767.5, num_updates=21600, lr=0.000430331, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=19902 epoch 013: 1368 / 1689 loss=3.74, nll_loss=2.211, ppl=4.63, wps=551854, ups=1.11, wpb=494992, bsz=16767.5, num_updates=21600, lr=0.000430331, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=19902 epoch 013: 1368 / 1689 loss=3.74, nll_loss=2.211, ppl=4.63, wps=551854, ups=1.11, wpb=494992, bsz=16767.5, num_updates=21600, lr=0.000430331, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=19902 epoch 013: 1368 / 1689 loss=3.74, nll_loss=2.211, ppl=4.63, wps=551854, ups=1.11, wpb=494992, bsz=16767.5, num_updates=21600, lr=0.000430331, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=19902 epoch 013: 1368 / 1689 loss=3.74, nll_loss=2.211, ppl=4.63, wps=551854, ups=1.11, wpb=494992, bsz=16767.5, num_updates=21600, lr=0.000430331, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=19902 epoch 013: 1468 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=553193, ups=1.12, wpb=495675, bsz=16767, num_updates=21700, lr=0.000429339, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=19991 epoch 013: 1468 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=553193, ups=1.12, wpb=495675, bsz=16767, num_updates=21700, lr=0.000429339, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=19991 epoch 013: 1468 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=553193, ups=1.12, wpb=495675, bsz=16767, num_updates=21700, lr=0.000429339, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=19991 epoch 013: 1468 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=553193, ups=1.12, wpb=495675, bsz=16767, num_updates=21700, lr=0.000429339, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=19991 epoch 013: 1468 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=553193, ups=1.12, wpb=495675, bsz=16767, num_updates=21700, lr=0.000429339, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=19991 epoch 013: 1468 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=553193, ups=1.12, wpb=495675, bsz=16767, num_updates=21700, lr=0.000429339, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=19991 epoch 013: 1468 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=553193, ups=1.12, wpb=495675, bsz=16767, num_updates=21700, lr=0.000429339, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=19991 epoch 013: 1468 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=553193, ups=1.12, wpb=495675, bsz=16767, num_updates=21700, lr=0.000429339, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=19991 epoch 013: 1468 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=553193, ups=1.12, wpb=495675, bsz=16767, num_updates=21700, lr=0.000429339, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=19991 epoch 013: 1468 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=553193, ups=1.12, wpb=495675, bsz=16767, num_updates=21700, lr=0.000429339, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=19991 epoch 013: 1468 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=553193, ups=1.12, wpb=495675, bsz=16767, num_updates=21700, lr=0.000429339, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=19991 epoch 013: 1468 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=553193, ups=1.12, wpb=495675, bsz=16767, num_updates=21700, lr=0.000429339, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=19991 epoch 013: 1468 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=553193, ups=1.12, wpb=495675, bsz=16767, num_updates=21700, lr=0.000429339, gnorm=0.209, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=19991 epoch 013: 1568 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=557538, ups=1.13, wpb=495073, bsz=16437, num_updates=21800, lr=0.000428353, gnorm=0.22, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=20080 epoch 013: 1568 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=557538, ups=1.13, wpb=495073, bsz=16437, num_updates=21800, lr=0.000428353, gnorm=0.22, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=20080 epoch 013: 1568 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=557538, ups=1.13, wpb=495073, bsz=16437, num_updates=21800, lr=0.000428353, gnorm=0.22, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=20080 epoch 013: 1568 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=557538, ups=1.13, wpb=495073, bsz=16437, num_updates=21800, lr=0.000428353, gnorm=0.22, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=20080 epoch 013: 1568 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=557538, ups=1.13, wpb=495073, bsz=16437, num_updates=21800, lr=0.000428353, gnorm=0.22, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=20080 epoch 013: 1568 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=557538, ups=1.13, wpb=495073, bsz=16437, num_updates=21800, lr=0.000428353, gnorm=0.22, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=20080 epoch 013: 1568 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=557538, ups=1.13, wpb=495073, bsz=16437, num_updates=21800, lr=0.000428353, gnorm=0.22, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=20080 epoch 013: 1568 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=557538, ups=1.13, wpb=495073, bsz=16437, num_updates=21800, lr=0.000428353, gnorm=0.22, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=20080 epoch 013: 1568 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=557538, ups=1.13, wpb=495073, bsz=16437, num_updates=21800, lr=0.000428353, gnorm=0.22, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=20080 epoch 013: 1568 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=557538, ups=1.13, wpb=495073, bsz=16437, num_updates=21800, lr=0.000428353, gnorm=0.22, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=20080 epoch 013: 1568 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=557538, ups=1.13, wpb=495073, bsz=16437, num_updates=21800, lr=0.000428353, gnorm=0.22, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=20080 epoch 013: 1568 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=557538, ups=1.13, wpb=495073, bsz=16437, num_updates=21800, lr=0.000428353, gnorm=0.22, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=20080 epoch 013: 1568 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=557538, ups=1.13, wpb=495073, bsz=16437, num_updates=21800, lr=0.000428353, gnorm=0.22, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=20080 epoch 013: 1668 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554382, ups=1.12, wpb=495714, bsz=16411.5, num_updates=21900, lr=0.000427374, gnorm=0.209, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=20170 epoch 013: 1668 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554382, ups=1.12, wpb=495714, bsz=16411.5, num_updates=21900, lr=0.000427374, gnorm=0.209, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=20170 epoch 013: 1668 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554382, ups=1.12, wpb=495714, bsz=16411.5, num_updates=21900, lr=0.000427374, gnorm=0.209, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=20170 epoch 013: 1668 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554382, ups=1.12, wpb=495714, bsz=16411.5, num_updates=21900, lr=0.000427374, gnorm=0.209, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=20170 epoch 013: 1668 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554382, ups=1.12, wpb=495714, bsz=16411.5, num_updates=21900, lr=0.000427374, gnorm=0.209, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=20170 epoch 013: 1668 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554382, ups=1.12, wpb=495714, bsz=16411.5, num_updates=21900, lr=0.000427374, gnorm=0.209, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=20170 epoch 013: 1668 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554382, ups=1.12, wpb=495714, bsz=16411.5, num_updates=21900, lr=0.000427374, gnorm=0.209, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=20170 epoch 013: 1668 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554382, ups=1.12, wpb=495714, bsz=16411.5, num_updates=21900, lr=0.000427374, gnorm=0.209, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=20170 epoch 013: 1668 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554382, ups=1.12, wpb=495714, bsz=16411.5, num_updates=21900, lr=0.000427374, gnorm=0.209, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=20170 epoch 013: 1668 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554382, ups=1.12, wpb=495714, bsz=16411.5, num_updates=21900, lr=0.000427374, gnorm=0.209, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=20170 epoch 013: 1668 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554382, ups=1.12, wpb=495714, bsz=16411.5, num_updates=21900, lr=0.000427374, gnorm=0.209, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=20170 epoch 013: 1668 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554382, ups=1.12, wpb=495714, bsz=16411.5, num_updates=21900, lr=0.000427374, gnorm=0.209, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=20170 epoch 013: 1668 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=554382, ups=1.12, wpb=495714, bsz=16411.5, num_updates=21900, lr=0.000427374, gnorm=0.209, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=20170 end of epoch 13 (average epoch stats below) epoch 013 | loss 3.74 | nll_loss 2.21 | ppl 4.63 | wps 551386 | ups 1.11 | wpb 495104 | bsz 16507 | num_updates 21921 | lr 0.000427169 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1477 | gb_free 23.2 | wall 20187 epoch 013 | loss 3.74 | nll_loss 2.21 | ppl 4.63 | wps 551386 | ups 1.11 | wpb 495104 | bsz 16507 | num_updates 21921 | lr 0.000427169 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1477 | gb_free 23.2 | wall 20187 epoch 013 | loss 3.74 | nll_loss 2.21 | ppl 4.63 | wps 551386 | ups 1.11 | wpb 495104 | bsz 16507 | num_updates 21921 | lr 0.000427169 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1477 | gb_free 23.2 | wall 20187 epoch 013 | loss 3.74 | nll_loss 2.21 | ppl 4.63 | wps 551386 | ups 1.11 | wpb 495104 | bsz 16507 | num_updates 21921 | lr 0.000427169 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1477 | gb_free 23.2 | wall 20187 epoch 013 | loss 3.74 | nll_loss 2.21 | ppl 4.63 | wps 551386 | ups 1.11 | wpb 495104 | bsz 16507 | num_updates 21921 | lr 0.000427169 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1477 | gb_free 23.2 | wall 20187 epoch 013 | loss 3.74 | nll_loss 2.21 | ppl 4.63 | wps 551386 | ups 1.11 | wpb 495104 | bsz 16507 | num_updates 21921 | lr 0.000427169 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1477 | gb_free 23.2 | wall 20187 epoch 013 | loss 3.74 | nll_loss 2.21 | ppl 4.63 | wps 551386 | ups 1.11 | wpb 495104 | bsz 16507 | num_updates 21921 | lr 0.000427169 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1477 | gb_free 23.2 | wall 20187 epoch 013 | loss 3.74 | nll_loss 2.21 | ppl 4.63 | wps 551386 | ups 1.11 | wpb 495104 | bsz 16507 | num_updates 21921 | lr 0.000427169 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1477 | gb_free 23.2 | wall 20187 epoch 013 | loss 3.74 | nll_loss 2.21 | ppl 4.63 | wps 551386 | ups 1.11 | wpb 495104 | bsz 16507 | num_updates 21921 | lr 0.000427169 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1477 | gb_free 23.2 | wall 20187 epoch 013 | loss 3.74 | nll_loss 2.21 | ppl 4.63 | wps 551386 | ups 1.11 | wpb 495104 | bsz 16507 | num_updates 21921 | lr 0.000427169 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1477 | gb_free 23.2 | wall 20187 epoch 013 | loss 3.74 | nll_loss 2.21 | ppl 4.63 | wps 551386 | ups 1.11 | wpb 495104 | bsz 16507 | num_updates 21921 | lr 0.000427169 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1477 | gb_free 23.2 | wall 20187 epoch 013 | loss 3.74 | nll_loss 2.21 | ppl 4.63 | wps 551386 | ups 1.11 | wpb 495104 | bsz 16507 | num_updates 21921 | lr 0.000427169 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1477 | gb_free 23.2 | wall 20187 epoch 013 | loss 3.74 | nll_loss 2.21 | ppl 4.63 | wps 551386 | ups 1.11 | wpb 495104 | bsz 16507 | num_updates 21921 | lr 0.000427169 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1477 | gb_free 23.2 | wall 20187 Start iterating over samples epoch 014: 79 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=553140, ups=1.12, wpb=492338, bsz=16229.8, num_updates=22000, lr=0.000426401, gnorm=0.208, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=20259 epoch 014: 79 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=553140, ups=1.12, wpb=492338, bsz=16229.8, num_updates=22000, lr=0.000426401, gnorm=0.208, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=20259 epoch 014: 79 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=553140, ups=1.12, wpb=492338, bsz=16229.8, num_updates=22000, lr=0.000426401, gnorm=0.208, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=20259 epoch 014: 79 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=553140, ups=1.12, wpb=492338, bsz=16229.8, num_updates=22000, lr=0.000426401, gnorm=0.208, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=20259 epoch 014: 79 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=553140, ups=1.12, wpb=492338, bsz=16229.8, num_updates=22000, lr=0.000426401, gnorm=0.208, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=20259 epoch 014: 79 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=553140, ups=1.12, wpb=492338, bsz=16229.8, num_updates=22000, lr=0.000426401, gnorm=0.208, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=20259 epoch 014: 79 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=553140, ups=1.12, wpb=492338, bsz=16229.8, num_updates=22000, lr=0.000426401, gnorm=0.208, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=20259 epoch 014: 79 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=553140, ups=1.12, wpb=492338, bsz=16229.8, num_updates=22000, lr=0.000426401, gnorm=0.208, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=20259 epoch 014: 79 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=553140, ups=1.12, wpb=492338, bsz=16229.8, num_updates=22000, lr=0.000426401, gnorm=0.208, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=20259 epoch 014: 79 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=553140, ups=1.12, wpb=492338, bsz=16229.8, num_updates=22000, lr=0.000426401, gnorm=0.208, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=20259 epoch 014: 79 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=553140, ups=1.12, wpb=492338, bsz=16229.8, num_updates=22000, lr=0.000426401, gnorm=0.208, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=20259 epoch 014: 79 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=553140, ups=1.12, wpb=492338, bsz=16229.8, num_updates=22000, lr=0.000426401, gnorm=0.208, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=20259 epoch 014: 79 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=553140, ups=1.12, wpb=492338, bsz=16229.8, num_updates=22000, lr=0.000426401, gnorm=0.208, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=20259 epoch 014: 79 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=553140, ups=1.12, wpb=492338, bsz=16229.8, num_updates=22000, lr=0.000426401, gnorm=0.208, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=20259 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 3.763 | nll_loss 2.195 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.76 epoch 014 | valid on 'valid' subset | loss 3.763 | nll_loss 2.195 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.76 epoch 014 | valid on 'valid' subset | loss 3.763 | nll_loss 2.195 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.76 epoch 014 | valid on 'valid' subset | loss 3.763 | nll_loss 2.195 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.76 epoch 014 | valid on 'valid' subset | loss 3.763 | nll_loss 2.195 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.76 epoch 014 | valid on 'valid' subset | loss 3.763 | nll_loss 2.195 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.76 epoch 014 | valid on 'valid' subset | loss 3.763 | nll_loss 2.195 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.76 epoch 014 | valid on 'valid' subset | loss 3.763 | nll_loss 2.195 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.76 epoch 014 | valid on 'valid' subset | loss 3.763 | nll_loss 2.195 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.76 epoch 014 | valid on 'valid' subset | loss 3.763 | nll_loss 2.195 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.76 epoch 014 | valid on 'valid' subset | loss 3.763 | nll_loss 2.195 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.76 epoch 014 | valid on 'valid' subset | loss 3.763 | nll_loss 2.195 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.76 epoch 014 | valid on 'valid' subset | loss 3.763 | nll_loss 2.195 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.76 epoch 014 | valid on 'valid' subset | loss 3.763 | nll_loss 2.195 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.76 epoch 014: 179 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=487719, ups=0.98, wpb=497567, bsz=16512.9, num_updates=22100, lr=0.000425436, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20361 epoch 014: 179 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=487719, ups=0.98, wpb=497567, bsz=16512.9, num_updates=22100, lr=0.000425436, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20361 epoch 014: 179 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=487719, ups=0.98, wpb=497567, bsz=16512.9, num_updates=22100, lr=0.000425436, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20361 epoch 014: 179 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=487719, ups=0.98, wpb=497567, bsz=16512.9, num_updates=22100, lr=0.000425436, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20361 epoch 014: 179 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=487719, ups=0.98, wpb=497567, bsz=16512.9, num_updates=22100, lr=0.000425436, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20361 epoch 014: 179 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=487719, ups=0.98, wpb=497567, bsz=16512.9, num_updates=22100, lr=0.000425436, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20361 epoch 014: 179 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=487719, ups=0.98, wpb=497567, bsz=16512.9, num_updates=22100, lr=0.000425436, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20361 epoch 014: 179 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=487719, ups=0.98, wpb=497567, bsz=16512.9, num_updates=22100, lr=0.000425436, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20361 epoch 014: 179 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=487719, ups=0.98, wpb=497567, bsz=16512.9, num_updates=22100, lr=0.000425436, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20361 epoch 014: 179 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=487719, ups=0.98, wpb=497567, bsz=16512.9, num_updates=22100, lr=0.000425436, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20361 epoch 014: 179 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=487719, ups=0.98, wpb=497567, bsz=16512.9, num_updates=22100, lr=0.000425436, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20361 epoch 014: 179 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=487719, ups=0.98, wpb=497567, bsz=16512.9, num_updates=22100, lr=0.000425436, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20361 epoch 014: 179 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=487719, ups=0.98, wpb=497567, bsz=16512.9, num_updates=22100, lr=0.000425436, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20361 epoch 014: 179 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=487719, ups=0.98, wpb=497567, bsz=16512.9, num_updates=22100, lr=0.000425436, gnorm=0.206, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20361 epoch 014: 280 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=551222, ups=1.11, wpb=496784, bsz=16352.3, num_updates=22200, lr=0.000424476, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=20451 epoch 014: 280 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=551222, ups=1.11, wpb=496784, bsz=16352.3, num_updates=22200, lr=0.000424476, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=20451 epoch 014: 280 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=551222, ups=1.11, wpb=496784, bsz=16352.3, num_updates=22200, lr=0.000424476, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=20451 epoch 014: 280 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=551222, ups=1.11, wpb=496784, bsz=16352.3, num_updates=22200, lr=0.000424476, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=20451 epoch 014: 280 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=551222, ups=1.11, wpb=496784, bsz=16352.3, num_updates=22200, lr=0.000424476, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=20451 epoch 014: 280 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=551222, ups=1.11, wpb=496784, bsz=16352.3, num_updates=22200, lr=0.000424476, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=20451 epoch 014: 280 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=551222, ups=1.11, wpb=496784, bsz=16352.3, num_updates=22200, lr=0.000424476, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=20451 epoch 014: 280 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=551222, ups=1.11, wpb=496784, bsz=16352.3, num_updates=22200, lr=0.000424476, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=20451 epoch 014: 280 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=551222, ups=1.11, wpb=496784, bsz=16352.3, num_updates=22200, lr=0.000424476, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=20451 epoch 014: 280 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=551222, ups=1.11, wpb=496784, bsz=16352.3, num_updates=22200, lr=0.000424476, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=20451 epoch 014: 280 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=551222, ups=1.11, wpb=496784, bsz=16352.3, num_updates=22200, lr=0.000424476, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=20451 epoch 014: 280 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=551222, ups=1.11, wpb=496784, bsz=16352.3, num_updates=22200, lr=0.000424476, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=20451 epoch 014: 280 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=551222, ups=1.11, wpb=496784, bsz=16352.3, num_updates=22200, lr=0.000424476, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=20451 epoch 014: 280 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=551222, ups=1.11, wpb=496784, bsz=16352.3, num_updates=22200, lr=0.000424476, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=20451 epoch 014: 380 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=556599, ups=1.12, wpb=495805, bsz=16745.4, num_updates=22300, lr=0.000423524, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=20540 epoch 014: 380 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=556599, ups=1.12, wpb=495805, bsz=16745.4, num_updates=22300, lr=0.000423524, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=20540 epoch 014: 380 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=556599, ups=1.12, wpb=495805, bsz=16745.4, num_updates=22300, lr=0.000423524, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=20540 epoch 014: 380 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=556599, ups=1.12, wpb=495805, bsz=16745.4, num_updates=22300, lr=0.000423524, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=20540 epoch 014: 380 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=556599, ups=1.12, wpb=495805, bsz=16745.4, num_updates=22300, lr=0.000423524, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=20540 epoch 014: 380 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=556599, ups=1.12, wpb=495805, bsz=16745.4, num_updates=22300, lr=0.000423524, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=20540 epoch 014: 380 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=556599, ups=1.12, wpb=495805, bsz=16745.4, num_updates=22300, lr=0.000423524, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=20540 epoch 014: 380 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=556599, ups=1.12, wpb=495805, bsz=16745.4, num_updates=22300, lr=0.000423524, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=20540 epoch 014: 380 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=556599, ups=1.12, wpb=495805, bsz=16745.4, num_updates=22300, lr=0.000423524, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=20540 epoch 014: 380 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=556599, ups=1.12, wpb=495805, bsz=16745.4, num_updates=22300, lr=0.000423524, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=20540 epoch 014: 380 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=556599, ups=1.12, wpb=495805, bsz=16745.4, num_updates=22300, lr=0.000423524, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=20540 epoch 014: 380 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=556599, ups=1.12, wpb=495805, bsz=16745.4, num_updates=22300, lr=0.000423524, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=20540 epoch 014: 380 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=556599, ups=1.12, wpb=495805, bsz=16745.4, num_updates=22300, lr=0.000423524, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=20540 epoch 014: 380 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=556599, ups=1.12, wpb=495805, bsz=16745.4, num_updates=22300, lr=0.000423524, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=20540 epoch 014: 480 / 1689 loss=3.728, nll_loss=2.197, ppl=4.59, wps=552637, ups=1.12, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20629 epoch 014: 480 / 1689 loss=3.728, nll_loss=2.197, ppl=4.59, wps=552637, ups=1.12, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20629 epoch 014: 480 / 1689 loss=3.728, nll_loss=2.197, ppl=4.59, wps=552637, ups=1.12, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20629 epoch 014: 480 / 1689 loss=3.728, nll_loss=2.197, ppl=4.59, wps=552637, ups=1.12, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20629 epoch 014: 480 / 1689 loss=3.728, nll_loss=2.197, ppl=4.59, wps=552637, ups=1.12, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20629 epoch 014: 480 / 1689 loss=3.728, nll_loss=2.197, ppl=4.59, wps=552637, ups=1.12, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20629 epoch 014: 480 / 1689 loss=3.728, nll_loss=2.197, ppl=4.59, wps=552637, ups=1.12, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20629 epoch 014: 480 / 1689 loss=3.728, nll_loss=2.197, ppl=4.59, wps=552637, ups=1.12, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20629 epoch 014: 480 / 1689 loss=3.728, nll_loss=2.197, ppl=4.59, wps=552637, ups=1.12, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20629 epoch 014: 480 / 1689 loss=3.728, nll_loss=2.197, ppl=4.59, wps=552637, ups=1.12, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20629 epoch 014: 480 / 1689 loss=3.728, nll_loss=2.197, ppl=4.59, wps=552637, ups=1.12, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20629 epoch 014: 480 / 1689 loss=3.728, nll_loss=2.197, ppl=4.59, wps=552637, ups=1.12, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20629 epoch 014: 480 / 1689 loss=3.728, nll_loss=2.197, ppl=4.59, wps=552637, ups=1.12, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20629 epoch 014: 480 / 1689 loss=3.728, nll_loss=2.197, ppl=4.59, wps=552637, ups=1.12, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.21, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=20629 epoch 014: 581 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=548841, ups=1.11, wpb=495554, bsz=16470, num_updates=22500, lr=0.000421637, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=20720 epoch 014: 581 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=548841, ups=1.11, wpb=495554, bsz=16470, num_updates=22500, lr=0.000421637, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=20720 epoch 014: 581 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=548841, ups=1.11, wpb=495554, bsz=16470, num_updates=22500, lr=0.000421637, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=20720 epoch 014: 581 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=548841, ups=1.11, wpb=495554, bsz=16470, num_updates=22500, lr=0.000421637, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=20720 epoch 014: 581 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=548841, ups=1.11, wpb=495554, bsz=16470, num_updates=22500, lr=0.000421637, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=20720 epoch 014: 581 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=548841, ups=1.11, wpb=495554, bsz=16470, num_updates=22500, lr=0.000421637, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=20720 epoch 014: 581 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=548841, ups=1.11, wpb=495554, bsz=16470, num_updates=22500, lr=0.000421637, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=20720 epoch 014: 581 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=548841, ups=1.11, wpb=495554, bsz=16470, num_updates=22500, lr=0.000421637, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=20720 epoch 014: 581 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=548841, ups=1.11, wpb=495554, bsz=16470, num_updates=22500, lr=0.000421637, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=20720 epoch 014: 581 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=548841, ups=1.11, wpb=495554, bsz=16470, num_updates=22500, lr=0.000421637, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=20720 epoch 014: 581 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=548841, ups=1.11, wpb=495554, bsz=16470, num_updates=22500, lr=0.000421637, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=20720 epoch 014: 581 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=548841, ups=1.11, wpb=495554, bsz=16470, num_updates=22500, lr=0.000421637, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=20720 epoch 014: 581 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=548841, ups=1.11, wpb=495554, bsz=16470, num_updates=22500, lr=0.000421637, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=20720 epoch 014: 581 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=548841, ups=1.11, wpb=495554, bsz=16470, num_updates=22500, lr=0.000421637, gnorm=0.214, clip=0, loss_scale=1, train_wall=90, gb_free=21.5, wall=20720 epoch 014: 681 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=553708, ups=1.12, wpb=494127, bsz=16356.6, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=20809 epoch 014: 681 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=553708, ups=1.12, wpb=494127, bsz=16356.6, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=20809 epoch 014: 681 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=553708, ups=1.12, wpb=494127, bsz=16356.6, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=20809 epoch 014: 681 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=553708, ups=1.12, wpb=494127, bsz=16356.6, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=20809 epoch 014: 681 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=553708, ups=1.12, wpb=494127, bsz=16356.6, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=20809 epoch 014: 681 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=553708, ups=1.12, wpb=494127, bsz=16356.6, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=20809 epoch 014: 681 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=553708, ups=1.12, wpb=494127, bsz=16356.6, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=20809 epoch 014: 681 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=553708, ups=1.12, wpb=494127, bsz=16356.6, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=20809 epoch 014: 681 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=553708, ups=1.12, wpb=494127, bsz=16356.6, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=20809 epoch 014: 681 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=553708, ups=1.12, wpb=494127, bsz=16356.6, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=20809 epoch 014: 681 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=553708, ups=1.12, wpb=494127, bsz=16356.6, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=20809 epoch 014: 681 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=553708, ups=1.12, wpb=494127, bsz=16356.6, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=20809 epoch 014: 681 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=553708, ups=1.12, wpb=494127, bsz=16356.6, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=20809 epoch 014: 681 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=553708, ups=1.12, wpb=494127, bsz=16356.6, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=20809 epoch 014: 781 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=556197, ups=1.12, wpb=494799, bsz=16435.1, num_updates=22700, lr=0.000419775, gnorm=0.212, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=20898 epoch 014: 781 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=556197, ups=1.12, wpb=494799, bsz=16435.1, num_updates=22700, lr=0.000419775, gnorm=0.212, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=20898 epoch 014: 781 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=556197, ups=1.12, wpb=494799, bsz=16435.1, num_updates=22700, lr=0.000419775, gnorm=0.212, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=20898 epoch 014: 781 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=556197, ups=1.12, wpb=494799, bsz=16435.1, num_updates=22700, lr=0.000419775, gnorm=0.212, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=20898 epoch 014: 781 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=556197, ups=1.12, wpb=494799, bsz=16435.1, num_updates=22700, lr=0.000419775, gnorm=0.212, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=20898 epoch 014: 781 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=556197, ups=1.12, wpb=494799, bsz=16435.1, num_updates=22700, lr=0.000419775, gnorm=0.212, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=20898 epoch 014: 781 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=556197, ups=1.12, wpb=494799, bsz=16435.1, num_updates=22700, lr=0.000419775, gnorm=0.212, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=20898 epoch 014: 781 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=556197, ups=1.12, wpb=494799, bsz=16435.1, num_updates=22700, lr=0.000419775, gnorm=0.212, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=20898 epoch 014: 781 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=556197, ups=1.12, wpb=494799, bsz=16435.1, num_updates=22700, lr=0.000419775, gnorm=0.212, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=20898 epoch 014: 781 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=556197, ups=1.12, wpb=494799, bsz=16435.1, num_updates=22700, lr=0.000419775, gnorm=0.212, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=20898 epoch 014: 781 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=556197, ups=1.12, wpb=494799, bsz=16435.1, num_updates=22700, lr=0.000419775, gnorm=0.212, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=20898 epoch 014: 781 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=556197, ups=1.12, wpb=494799, bsz=16435.1, num_updates=22700, lr=0.000419775, gnorm=0.212, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=20898 epoch 014: 781 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=556197, ups=1.12, wpb=494799, bsz=16435.1, num_updates=22700, lr=0.000419775, gnorm=0.212, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=20898 epoch 014: 781 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=556197, ups=1.12, wpb=494799, bsz=16435.1, num_updates=22700, lr=0.000419775, gnorm=0.212, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=20898 epoch 014: 881 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=557398, ups=1.12, wpb=495718, bsz=16602.4, num_updates=22800, lr=0.000418854, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=20987 epoch 014: 881 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=557398, ups=1.12, wpb=495718, bsz=16602.4, num_updates=22800, lr=0.000418854, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=20987 epoch 014: 881 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=557398, ups=1.12, wpb=495718, bsz=16602.4, num_updates=22800, lr=0.000418854, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=20987 epoch 014: 881 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=557398, ups=1.12, wpb=495718, bsz=16602.4, num_updates=22800, lr=0.000418854, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=20987 epoch 014: 881 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=557398, ups=1.12, wpb=495718, bsz=16602.4, num_updates=22800, lr=0.000418854, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=20987 epoch 014: 881 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=557398, ups=1.12, wpb=495718, bsz=16602.4, num_updates=22800, lr=0.000418854, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=20987 epoch 014: 881 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=557398, ups=1.12, wpb=495718, bsz=16602.4, num_updates=22800, lr=0.000418854, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=20987 epoch 014: 881 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=557398, ups=1.12, wpb=495718, bsz=16602.4, num_updates=22800, lr=0.000418854, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=20987 epoch 014: 881 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=557398, ups=1.12, wpb=495718, bsz=16602.4, num_updates=22800, lr=0.000418854, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=20987 epoch 014: 881 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=557398, ups=1.12, wpb=495718, bsz=16602.4, num_updates=22800, lr=0.000418854, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=20987 epoch 014: 881 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=557398, ups=1.12, wpb=495718, bsz=16602.4, num_updates=22800, lr=0.000418854, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=20987 epoch 014: 881 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=557398, ups=1.12, wpb=495718, bsz=16602.4, num_updates=22800, lr=0.000418854, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=20987 epoch 014: 881 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=557398, ups=1.12, wpb=495718, bsz=16602.4, num_updates=22800, lr=0.000418854, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=20987 epoch 014: 881 / 1689 loss=3.726, nll_loss=2.195, ppl=4.58, wps=557398, ups=1.12, wpb=495718, bsz=16602.4, num_updates=22800, lr=0.000418854, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=20987 epoch 014: 981 / 1689 loss=3.731, nll_loss=2.2, ppl=4.6, wps=551764, ups=1.12, wpb=494383, bsz=16759.3, num_updates=22900, lr=0.000417938, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=21076 epoch 014: 981 / 1689 loss=3.731, nll_loss=2.2, ppl=4.6, wps=551764, ups=1.12, wpb=494383, bsz=16759.3, num_updates=22900, lr=0.000417938, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=21076 epoch 014: 981 / 1689 loss=3.731, nll_loss=2.2, ppl=4.6, wps=551764, ups=1.12, wpb=494383, bsz=16759.3, num_updates=22900, lr=0.000417938, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=21076 epoch 014: 981 / 1689 loss=3.731, nll_loss=2.2, ppl=4.6, wps=551764, ups=1.12, wpb=494383, bsz=16759.3, num_updates=22900, lr=0.000417938, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=21076 epoch 014: 981 / 1689 loss=3.731, nll_loss=2.2, ppl=4.6, wps=551764, ups=1.12, wpb=494383, bsz=16759.3, num_updates=22900, lr=0.000417938, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=21076 epoch 014: 981 / 1689 loss=3.731, nll_loss=2.2, ppl=4.6, wps=551764, ups=1.12, wpb=494383, bsz=16759.3, num_updates=22900, lr=0.000417938, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=21076 epoch 014: 981 / 1689 loss=3.731, nll_loss=2.2, ppl=4.6, wps=551764, ups=1.12, wpb=494383, bsz=16759.3, num_updates=22900, lr=0.000417938, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=21076 epoch 014: 981 / 1689 loss=3.731, nll_loss=2.2, ppl=4.6, wps=551764, ups=1.12, wpb=494383, bsz=16759.3, num_updates=22900, lr=0.000417938, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=21076 epoch 014: 981 / 1689 loss=3.731, nll_loss=2.2, ppl=4.6, wps=551764, ups=1.12, wpb=494383, bsz=16759.3, num_updates=22900, lr=0.000417938, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=21076 epoch 014: 981 / 1689 loss=3.731, nll_loss=2.2, ppl=4.6, wps=551764, ups=1.12, wpb=494383, bsz=16759.3, num_updates=22900, lr=0.000417938, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=21076 epoch 014: 981 / 1689 loss=3.731, nll_loss=2.2, ppl=4.6, wps=551764, ups=1.12, wpb=494383, bsz=16759.3, num_updates=22900, lr=0.000417938, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=21076 epoch 014: 981 / 1689 loss=3.731, nll_loss=2.2, ppl=4.6, wps=551764, ups=1.12, wpb=494383, bsz=16759.3, num_updates=22900, lr=0.000417938, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=21076 epoch 014: 981 / 1689 loss=3.731, nll_loss=2.2, ppl=4.6, wps=551764, ups=1.12, wpb=494383, bsz=16759.3, num_updates=22900, lr=0.000417938, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=21076 epoch 014: 981 / 1689 loss=3.731, nll_loss=2.2, ppl=4.6, wps=551764, ups=1.12, wpb=494383, bsz=16759.3, num_updates=22900, lr=0.000417938, gnorm=0.211, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=21076 epoch 014: 1081 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=555780, ups=1.12, wpb=494678, bsz=16565.8, num_updates=23000, lr=0.000417029, gnorm=0.214, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=21165 epoch 014: 1081 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=555780, ups=1.12, wpb=494678, bsz=16565.8, num_updates=23000, lr=0.000417029, gnorm=0.214, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=21165 epoch 014: 1081 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=555780, ups=1.12, wpb=494678, bsz=16565.8, num_updates=23000, lr=0.000417029, gnorm=0.214, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=21165 epoch 014: 1081 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=555780, ups=1.12, wpb=494678, bsz=16565.8, num_updates=23000, lr=0.000417029, gnorm=0.214, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=21165 epoch 014: 1081 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=555780, ups=1.12, wpb=494678, bsz=16565.8, num_updates=23000, lr=0.000417029, gnorm=0.214, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=21165 epoch 014: 1081 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=555780, ups=1.12, wpb=494678, bsz=16565.8, num_updates=23000, lr=0.000417029, gnorm=0.214, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=21165 epoch 014: 1081 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=555780, ups=1.12, wpb=494678, bsz=16565.8, num_updates=23000, lr=0.000417029, gnorm=0.214, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=21165 epoch 014: 1081 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=555780, ups=1.12, wpb=494678, bsz=16565.8, num_updates=23000, lr=0.000417029, gnorm=0.214, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=21165 epoch 014: 1081 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=555780, ups=1.12, wpb=494678, bsz=16565.8, num_updates=23000, lr=0.000417029, gnorm=0.214, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=21165 epoch 014: 1081 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=555780, ups=1.12, wpb=494678, bsz=16565.8, num_updates=23000, lr=0.000417029, gnorm=0.214, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=21165 epoch 014: 1081 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=555780, ups=1.12, wpb=494678, bsz=16565.8, num_updates=23000, lr=0.000417029, gnorm=0.214, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=21165 epoch 014: 1081 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=555780, ups=1.12, wpb=494678, bsz=16565.8, num_updates=23000, lr=0.000417029, gnorm=0.214, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=21165 epoch 014: 1081 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=555780, ups=1.12, wpb=494678, bsz=16565.8, num_updates=23000, lr=0.000417029, gnorm=0.214, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=21165 epoch 014: 1081 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=555780, ups=1.12, wpb=494678, bsz=16565.8, num_updates=23000, lr=0.000417029, gnorm=0.214, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=21165 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 3.754 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.754 epoch 014 | valid on 'valid' subset | loss 3.754 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.754 epoch 014 | valid on 'valid' subset | loss 3.754 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.754 epoch 014 | valid on 'valid' subset | loss 3.754 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.754 epoch 014 | valid on 'valid' subset | loss 3.754 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.754 epoch 014 | valid on 'valid' subset | loss 3.754 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.754 epoch 014 | valid on 'valid' subset | loss 3.754 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.754 epoch 014 | valid on 'valid' subset | loss 3.754 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.754 epoch 014 | valid on 'valid' subset | loss 3.754 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.754 epoch 014 | valid on 'valid' subset | loss 3.754 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.754 epoch 014 | valid on 'valid' subset | loss 3.754 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.754 epoch 014 | valid on 'valid' subset | loss 3.754 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.754 epoch 014 | valid on 'valid' subset | loss 3.754 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.754 epoch 014 | valid on 'valid' subset | loss 3.754 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.754 epoch 014: 1181 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=388982, ups=0.79, wpb=494110, bsz=16510.6, num_updates=23100, lr=0.000416125, gnorm=0.215, clip=0, loss_scale=2, train_wall=103, gb_free=22.7, wall=21292 epoch 014: 1181 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=388982, ups=0.79, wpb=494110, bsz=16510.6, num_updates=23100, lr=0.000416125, gnorm=0.215, clip=0, loss_scale=2, train_wall=103, gb_free=22.7, wall=21292 epoch 014: 1181 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=388982, ups=0.79, wpb=494110, bsz=16510.6, num_updates=23100, lr=0.000416125, gnorm=0.215, clip=0, loss_scale=2, train_wall=103, gb_free=22.7, wall=21292 epoch 014: 1181 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=388982, ups=0.79, wpb=494110, bsz=16510.6, num_updates=23100, lr=0.000416125, gnorm=0.215, clip=0, loss_scale=2, train_wall=103, gb_free=22.7, wall=21292 epoch 014: 1181 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=388982, ups=0.79, wpb=494110, bsz=16510.6, num_updates=23100, lr=0.000416125, gnorm=0.215, clip=0, loss_scale=2, train_wall=103, gb_free=22.7, wall=21292 epoch 014: 1181 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=388982, ups=0.79, wpb=494110, bsz=16510.6, num_updates=23100, lr=0.000416125, gnorm=0.215, clip=0, loss_scale=2, train_wall=103, gb_free=22.7, wall=21292 epoch 014: 1181 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=388982, ups=0.79, wpb=494110, bsz=16510.6, num_updates=23100, lr=0.000416125, gnorm=0.215, clip=0, loss_scale=2, train_wall=103, gb_free=22.7, wall=21292 epoch 014: 1181 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=388982, ups=0.79, wpb=494110, bsz=16510.6, num_updates=23100, lr=0.000416125, gnorm=0.215, clip=0, loss_scale=2, train_wall=103, gb_free=22.7, wall=21292 epoch 014: 1181 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=388982, ups=0.79, wpb=494110, bsz=16510.6, num_updates=23100, lr=0.000416125, gnorm=0.215, clip=0, loss_scale=2, train_wall=103, gb_free=22.7, wall=21292 epoch 014: 1181 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=388982, ups=0.79, wpb=494110, bsz=16510.6, num_updates=23100, lr=0.000416125, gnorm=0.215, clip=0, loss_scale=2, train_wall=103, gb_free=22.7, wall=21292 epoch 014: 1181 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=388982, ups=0.79, wpb=494110, bsz=16510.6, num_updates=23100, lr=0.000416125, gnorm=0.215, clip=0, loss_scale=2, train_wall=103, gb_free=22.7, wall=21292 epoch 014: 1181 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=388982, ups=0.79, wpb=494110, bsz=16510.6, num_updates=23100, lr=0.000416125, gnorm=0.215, clip=0, loss_scale=2, train_wall=103, gb_free=22.7, wall=21292 epoch 014: 1181 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=388982, ups=0.79, wpb=494110, bsz=16510.6, num_updates=23100, lr=0.000416125, gnorm=0.215, clip=0, loss_scale=2, train_wall=103, gb_free=22.7, wall=21292 epoch 014: 1181 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=388982, ups=0.79, wpb=494110, bsz=16510.6, num_updates=23100, lr=0.000416125, gnorm=0.215, clip=0, loss_scale=2, train_wall=103, gb_free=22.7, wall=21292 epoch 014: 1281 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=560285, ups=1.13, wpb=496027, bsz=16215.6, num_updates=23200, lr=0.000415227, gnorm=0.202, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=21381 epoch 014: 1281 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=560285, ups=1.13, wpb=496027, bsz=16215.6, num_updates=23200, lr=0.000415227, gnorm=0.202, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=21381 epoch 014: 1281 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=560285, ups=1.13, wpb=496027, bsz=16215.6, num_updates=23200, lr=0.000415227, gnorm=0.202, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=21381 epoch 014: 1281 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=560285, ups=1.13, wpb=496027, bsz=16215.6, num_updates=23200, lr=0.000415227, gnorm=0.202, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=21381 epoch 014: 1281 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=560285, ups=1.13, wpb=496027, bsz=16215.6, num_updates=23200, lr=0.000415227, gnorm=0.202, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=21381 epoch 014: 1281 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=560285, ups=1.13, wpb=496027, bsz=16215.6, num_updates=23200, lr=0.000415227, gnorm=0.202, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=21381 epoch 014: 1281 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=560285, ups=1.13, wpb=496027, bsz=16215.6, num_updates=23200, lr=0.000415227, gnorm=0.202, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=21381 epoch 014: 1281 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=560285, ups=1.13, wpb=496027, bsz=16215.6, num_updates=23200, lr=0.000415227, gnorm=0.202, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=21381 epoch 014: 1281 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=560285, ups=1.13, wpb=496027, bsz=16215.6, num_updates=23200, lr=0.000415227, gnorm=0.202, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=21381 epoch 014: 1281 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=560285, ups=1.13, wpb=496027, bsz=16215.6, num_updates=23200, lr=0.000415227, gnorm=0.202, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=21381 epoch 014: 1281 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=560285, ups=1.13, wpb=496027, bsz=16215.6, num_updates=23200, lr=0.000415227, gnorm=0.202, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=21381 epoch 014: 1281 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=560285, ups=1.13, wpb=496027, bsz=16215.6, num_updates=23200, lr=0.000415227, gnorm=0.202, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=21381 epoch 014: 1281 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=560285, ups=1.13, wpb=496027, bsz=16215.6, num_updates=23200, lr=0.000415227, gnorm=0.202, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=21381 epoch 014: 1281 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=560285, ups=1.13, wpb=496027, bsz=16215.6, num_updates=23200, lr=0.000415227, gnorm=0.202, clip=0, loss_scale=2, train_wall=87, gb_free=21.9, wall=21381 epoch 014: 1381 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=555364, ups=1.12, wpb=495462, bsz=16346.1, num_updates=23300, lr=0.000414335, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21470 epoch 014: 1381 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=555364, ups=1.12, wpb=495462, bsz=16346.1, num_updates=23300, lr=0.000414335, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21470 epoch 014: 1381 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=555364, ups=1.12, wpb=495462, bsz=16346.1, num_updates=23300, lr=0.000414335, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21470 epoch 014: 1381 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=555364, ups=1.12, wpb=495462, bsz=16346.1, num_updates=23300, lr=0.000414335, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21470 epoch 014: 1381 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=555364, ups=1.12, wpb=495462, bsz=16346.1, num_updates=23300, lr=0.000414335, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21470 epoch 014: 1381 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=555364, ups=1.12, wpb=495462, bsz=16346.1, num_updates=23300, lr=0.000414335, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21470 epoch 014: 1381 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=555364, ups=1.12, wpb=495462, bsz=16346.1, num_updates=23300, lr=0.000414335, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21470 epoch 014: 1381 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=555364, ups=1.12, wpb=495462, bsz=16346.1, num_updates=23300, lr=0.000414335, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21470 epoch 014: 1381 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=555364, ups=1.12, wpb=495462, bsz=16346.1, num_updates=23300, lr=0.000414335, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21470 epoch 014: 1381 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=555364, ups=1.12, wpb=495462, bsz=16346.1, num_updates=23300, lr=0.000414335, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21470 epoch 014: 1381 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=555364, ups=1.12, wpb=495462, bsz=16346.1, num_updates=23300, lr=0.000414335, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21470 epoch 014: 1381 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=555364, ups=1.12, wpb=495462, bsz=16346.1, num_updates=23300, lr=0.000414335, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21470 epoch 014: 1381 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=555364, ups=1.12, wpb=495462, bsz=16346.1, num_updates=23300, lr=0.000414335, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21470 epoch 014: 1381 / 1689 loss=3.734, nll_loss=2.204, ppl=4.61, wps=555364, ups=1.12, wpb=495462, bsz=16346.1, num_updates=23300, lr=0.000414335, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=21470 epoch 014: 1481 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=552691, ups=1.12, wpb=495065, bsz=16698, num_updates=23400, lr=0.000413449, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=21560 epoch 014: 1481 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=552691, ups=1.12, wpb=495065, bsz=16698, num_updates=23400, lr=0.000413449, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=21560 epoch 014: 1481 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=552691, ups=1.12, wpb=495065, bsz=16698, num_updates=23400, lr=0.000413449, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=21560 epoch 014: 1481 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=552691, ups=1.12, wpb=495065, bsz=16698, num_updates=23400, lr=0.000413449, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=21560 epoch 014: 1481 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=552691, ups=1.12, wpb=495065, bsz=16698, num_updates=23400, lr=0.000413449, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=21560 epoch 014: 1481 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=552691, ups=1.12, wpb=495065, bsz=16698, num_updates=23400, lr=0.000413449, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=21560 epoch 014: 1481 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=552691, ups=1.12, wpb=495065, bsz=16698, num_updates=23400, lr=0.000413449, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=21560 epoch 014: 1481 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=552691, ups=1.12, wpb=495065, bsz=16698, num_updates=23400, lr=0.000413449, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=21560 epoch 014: 1481 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=552691, ups=1.12, wpb=495065, bsz=16698, num_updates=23400, lr=0.000413449, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=21560 epoch 014: 1481 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=552691, ups=1.12, wpb=495065, bsz=16698, num_updates=23400, lr=0.000413449, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=21560 epoch 014: 1481 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=552691, ups=1.12, wpb=495065, bsz=16698, num_updates=23400, lr=0.000413449, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=21560 epoch 014: 1481 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=552691, ups=1.12, wpb=495065, bsz=16698, num_updates=23400, lr=0.000413449, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=21560 epoch 014: 1481 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=552691, ups=1.12, wpb=495065, bsz=16698, num_updates=23400, lr=0.000413449, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=21560 epoch 014: 1481 / 1689 loss=3.733, nll_loss=2.203, ppl=4.6, wps=552691, ups=1.12, wpb=495065, bsz=16698, num_updates=23400, lr=0.000413449, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=21560 epoch 014: 1581 / 1689 loss=3.722, nll_loss=2.191, ppl=4.57, wps=554989, ups=1.12, wpb=496915, bsz=16732.2, num_updates=23500, lr=0.000412568, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=21649 epoch 014: 1581 / 1689 loss=3.722, nll_loss=2.191, ppl=4.57, wps=554989, ups=1.12, wpb=496915, bsz=16732.2, num_updates=23500, lr=0.000412568, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=21649 epoch 014: 1581 / 1689 loss=3.722, nll_loss=2.191, ppl=4.57, wps=554989, ups=1.12, wpb=496915, bsz=16732.2, num_updates=23500, lr=0.000412568, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=21649 epoch 014: 1581 / 1689 loss=3.722, nll_loss=2.191, ppl=4.57, wps=554989, ups=1.12, wpb=496915, bsz=16732.2, num_updates=23500, lr=0.000412568, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=21649 epoch 014: 1581 / 1689 loss=3.722, nll_loss=2.191, ppl=4.57, wps=554989, ups=1.12, wpb=496915, bsz=16732.2, num_updates=23500, lr=0.000412568, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=21649 epoch 014: 1581 / 1689 loss=3.722, nll_loss=2.191, ppl=4.57, wps=554989, ups=1.12, wpb=496915, bsz=16732.2, num_updates=23500, lr=0.000412568, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=21649 epoch 014: 1581 / 1689 loss=3.722, nll_loss=2.191, ppl=4.57, wps=554989, ups=1.12, wpb=496915, bsz=16732.2, num_updates=23500, lr=0.000412568, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=21649 epoch 014: 1581 / 1689 loss=3.722, nll_loss=2.191, ppl=4.57, wps=554989, ups=1.12, wpb=496915, bsz=16732.2, num_updates=23500, lr=0.000412568, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=21649 epoch 014: 1581 / 1689 loss=3.722, nll_loss=2.191, ppl=4.57, wps=554989, ups=1.12, wpb=496915, bsz=16732.2, num_updates=23500, lr=0.000412568, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=21649 epoch 014: 1581 / 1689 loss=3.722, nll_loss=2.191, ppl=4.57, wps=554989, ups=1.12, wpb=496915, bsz=16732.2, num_updates=23500, lr=0.000412568, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=21649 epoch 014: 1581 / 1689 loss=3.722, nll_loss=2.191, ppl=4.57, wps=554989, ups=1.12, wpb=496915, bsz=16732.2, num_updates=23500, lr=0.000412568, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=21649 epoch 014: 1581 / 1689 loss=3.722, nll_loss=2.191, ppl=4.57, wps=554989, ups=1.12, wpb=496915, bsz=16732.2, num_updates=23500, lr=0.000412568, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=21649 epoch 014: 1581 / 1689 loss=3.722, nll_loss=2.191, ppl=4.57, wps=554989, ups=1.12, wpb=496915, bsz=16732.2, num_updates=23500, lr=0.000412568, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=21649 epoch 014: 1581 / 1689 loss=3.722, nll_loss=2.191, ppl=4.57, wps=554989, ups=1.12, wpb=496915, bsz=16732.2, num_updates=23500, lr=0.000412568, gnorm=0.218, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=21649 epoch 014: 1682 / 1689 loss=3.737, nll_loss=2.208, ppl=4.62, wps=547720, ups=1.11, wpb=493496, bsz=16372.6, num_updates=23600, lr=0.000411693, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=21739 epoch 014: 1682 / 1689 loss=3.737, nll_loss=2.208, ppl=4.62, wps=547720, ups=1.11, wpb=493496, bsz=16372.6, num_updates=23600, lr=0.000411693, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=21739 epoch 014: 1682 / 1689 loss=3.737, nll_loss=2.208, ppl=4.62, wps=547720, ups=1.11, wpb=493496, bsz=16372.6, num_updates=23600, lr=0.000411693, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=21739 epoch 014: 1682 / 1689 loss=3.737, nll_loss=2.208, ppl=4.62, wps=547720, ups=1.11, wpb=493496, bsz=16372.6, num_updates=23600, lr=0.000411693, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=21739 epoch 014: 1682 / 1689 loss=3.737, nll_loss=2.208, ppl=4.62, wps=547720, ups=1.11, wpb=493496, bsz=16372.6, num_updates=23600, lr=0.000411693, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=21739 epoch 014: 1682 / 1689 loss=3.737, nll_loss=2.208, ppl=4.62, wps=547720, ups=1.11, wpb=493496, bsz=16372.6, num_updates=23600, lr=0.000411693, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=21739 epoch 014: 1682 / 1689 loss=3.737, nll_loss=2.208, ppl=4.62, wps=547720, ups=1.11, wpb=493496, bsz=16372.6, num_updates=23600, lr=0.000411693, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=21739 epoch 014: 1682 / 1689 loss=3.737, nll_loss=2.208, ppl=4.62, wps=547720, ups=1.11, wpb=493496, bsz=16372.6, num_updates=23600, lr=0.000411693, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=21739 epoch 014: 1682 / 1689 loss=3.737, nll_loss=2.208, ppl=4.62, wps=547720, ups=1.11, wpb=493496, bsz=16372.6, num_updates=23600, lr=0.000411693, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=21739 epoch 014: 1682 / 1689 loss=3.737, nll_loss=2.208, ppl=4.62, wps=547720, ups=1.11, wpb=493496, bsz=16372.6, num_updates=23600, lr=0.000411693, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=21739 epoch 014: 1682 / 1689 loss=3.737, nll_loss=2.208, ppl=4.62, wps=547720, ups=1.11, wpb=493496, bsz=16372.6, num_updates=23600, lr=0.000411693, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=21739 epoch 014: 1682 / 1689 loss=3.737, nll_loss=2.208, ppl=4.62, wps=547720, ups=1.11, wpb=493496, bsz=16372.6, num_updates=23600, lr=0.000411693, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=21739 epoch 014: 1682 / 1689 loss=3.737, nll_loss=2.208, ppl=4.62, wps=547720, ups=1.11, wpb=493496, bsz=16372.6, num_updates=23600, lr=0.000411693, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=21739 epoch 014: 1682 / 1689 loss=3.737, nll_loss=2.208, ppl=4.62, wps=547720, ups=1.11, wpb=493496, bsz=16372.6, num_updates=23600, lr=0.000411693, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=21739 end of epoch 14 (average epoch stats below) epoch 014 | loss 3.728 | nll_loss 2.197 | ppl 4.58 | wps 535856 | ups 1.08 | wpb 495144 | bsz 16504.3 | num_updates 23607 | lr 0.000411632 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 23.9 | wall 21745 epoch 014 | loss 3.728 | nll_loss 2.197 | ppl 4.58 | wps 535856 | ups 1.08 | wpb 495144 | bsz 16504.3 | num_updates 23607 | lr 0.000411632 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 23.9 | wall 21745 epoch 014 | loss 3.728 | nll_loss 2.197 | ppl 4.58 | wps 535856 | ups 1.08 | wpb 495144 | bsz 16504.3 | num_updates 23607 | lr 0.000411632 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 23.9 | wall 21745 epoch 014 | loss 3.728 | nll_loss 2.197 | ppl 4.58 | wps 535856 | ups 1.08 | wpb 495144 | bsz 16504.3 | num_updates 23607 | lr 0.000411632 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 23.9 | wall 21745 epoch 014 | loss 3.728 | nll_loss 2.197 | ppl 4.58 | wps 535856 | ups 1.08 | wpb 495144 | bsz 16504.3 | num_updates 23607 | lr 0.000411632 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 23.9 | wall 21745 epoch 014 | loss 3.728 | nll_loss 2.197 | ppl 4.58 | wps 535856 | ups 1.08 | wpb 495144 | bsz 16504.3 | num_updates 23607 | lr 0.000411632 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 23.9 | wall 21745 epoch 014 | loss 3.728 | nll_loss 2.197 | ppl 4.58 | wps 535856 | ups 1.08 | wpb 495144 | bsz 16504.3 | num_updates 23607 | lr 0.000411632 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 23.9 | wall 21745 epoch 014 | loss 3.728 | nll_loss 2.197 | ppl 4.58 | wps 535856 | ups 1.08 | wpb 495144 | bsz 16504.3 | num_updates 23607 | lr 0.000411632 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 23.9 | wall 21745 epoch 014 | loss 3.728 | nll_loss 2.197 | ppl 4.58 | wps 535856 | ups 1.08 | wpb 495144 | bsz 16504.3 | num_updates 23607 | lr 0.000411632 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 23.9 | wall 21745 epoch 014 | loss 3.728 | nll_loss 2.197 | ppl 4.58 | wps 535856 | ups 1.08 | wpb 495144 | bsz 16504.3 | num_updates 23607 | lr 0.000411632 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 23.9 | wall 21745 epoch 014 | loss 3.728 | nll_loss 2.197 | ppl 4.58 | wps 535856 | ups 1.08 | wpb 495144 | bsz 16504.3 | num_updates 23607 | lr 0.000411632 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 23.9 | wall 21745 epoch 014 | loss 3.728 | nll_loss 2.197 | ppl 4.58 | wps 535856 | ups 1.08 | wpb 495144 | bsz 16504.3 | num_updates 23607 | lr 0.000411632 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 23.9 | wall 21745 epoch 014 | loss 3.728 | nll_loss 2.197 | ppl 4.58 | wps 535856 | ups 1.08 | wpb 495144 | bsz 16504.3 | num_updates 23607 | lr 0.000411632 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 23.9 | wall 21745 epoch 014 | loss 3.728 | nll_loss 2.197 | ppl 4.58 | wps 535856 | ups 1.08 | wpb 495144 | bsz 16504.3 | num_updates 23607 | lr 0.000411632 | gnorm 0.208 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 23.9 | wall 21745 Start iterating over samples epoch 015: 94 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=539535, ups=1.1, wpb=490306, bsz=16242.1, num_updates=23700, lr=0.000410824, gnorm=0.22, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=21830 epoch 015: 94 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=539535, ups=1.1, wpb=490306, bsz=16242.1, num_updates=23700, lr=0.000410824, gnorm=0.22, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=21830 epoch 015: 94 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=539535, ups=1.1, wpb=490306, bsz=16242.1, num_updates=23700, lr=0.000410824, gnorm=0.22, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=21830 epoch 015: 94 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=539535, ups=1.1, wpb=490306, bsz=16242.1, num_updates=23700, lr=0.000410824, gnorm=0.22, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=21830 epoch 015: 94 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=539535, ups=1.1, wpb=490306, bsz=16242.1, num_updates=23700, lr=0.000410824, gnorm=0.22, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=21830 epoch 015: 94 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=539535, ups=1.1, wpb=490306, bsz=16242.1, num_updates=23700, lr=0.000410824, gnorm=0.22, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=21830 epoch 015: 94 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=539535, ups=1.1, wpb=490306, bsz=16242.1, num_updates=23700, lr=0.000410824, gnorm=0.22, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=21830 epoch 015: 94 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=539535, ups=1.1, wpb=490306, bsz=16242.1, num_updates=23700, lr=0.000410824, gnorm=0.22, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=21830 epoch 015: 94 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=539535, ups=1.1, wpb=490306, bsz=16242.1, num_updates=23700, lr=0.000410824, gnorm=0.22, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=21830 epoch 015: 94 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=539535, ups=1.1, wpb=490306, bsz=16242.1, num_updates=23700, lr=0.000410824, gnorm=0.22, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=21830 epoch 015: 94 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=539535, ups=1.1, wpb=490306, bsz=16242.1, num_updates=23700, lr=0.000410824, gnorm=0.22, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=21830 epoch 015: 94 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=539535, ups=1.1, wpb=490306, bsz=16242.1, num_updates=23700, lr=0.000410824, gnorm=0.22, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=21830 epoch 015: 94 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=539535, ups=1.1, wpb=490306, bsz=16242.1, num_updates=23700, lr=0.000410824, gnorm=0.22, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=21830 epoch 015: 94 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=539535, ups=1.1, wpb=490306, bsz=16242.1, num_updates=23700, lr=0.000410824, gnorm=0.22, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=21830 epoch 015: 94 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=539535, ups=1.1, wpb=490306, bsz=16242.1, num_updates=23700, lr=0.000410824, gnorm=0.22, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=21830 epoch 015: 194 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=552246, ups=1.12, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=21920 epoch 015: 194 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=552246, ups=1.12, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=21920 epoch 015: 194 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=552246, ups=1.12, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=21920 epoch 015: 194 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=552246, ups=1.12, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=21920 epoch 015: 194 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=552246, ups=1.12, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=21920 epoch 015: 194 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=552246, ups=1.12, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=21920 epoch 015: 194 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=552246, ups=1.12, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=21920 epoch 015: 194 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=552246, ups=1.12, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=21920 epoch 015: 194 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=552246, ups=1.12, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=21920 epoch 015: 194 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=552246, ups=1.12, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=21920 epoch 015: 194 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=552246, ups=1.12, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=21920 epoch 015: 194 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=552246, ups=1.12, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=21920 epoch 015: 194 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=552246, ups=1.12, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=21920 epoch 015: 194 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=552246, ups=1.12, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=21920 epoch 015: 194 / 1689 loss=3.712, nll_loss=2.179, ppl=4.53, wps=552246, ups=1.12, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=21920 epoch 015: 294 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=556432, ups=1.12, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=22009 epoch 015: 294 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=556432, ups=1.12, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=22009 epoch 015: 294 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=556432, ups=1.12, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=22009 epoch 015: 294 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=556432, ups=1.12, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=22009 epoch 015: 294 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=556432, ups=1.12, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=22009 epoch 015: 294 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=556432, ups=1.12, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=22009 epoch 015: 294 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=556432, ups=1.12, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=22009 epoch 015: 294 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=556432, ups=1.12, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=22009 epoch 015: 294 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=556432, ups=1.12, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=22009 epoch 015: 294 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=556432, ups=1.12, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=22009 epoch 015: 294 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=556432, ups=1.12, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=22009 epoch 015: 294 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=556432, ups=1.12, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=22009 epoch 015: 294 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=556432, ups=1.12, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=22009 epoch 015: 294 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=556432, ups=1.12, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=22009 epoch 015: 294 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=556432, ups=1.12, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=22009 epoch 015: 394 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=556689, ups=1.12, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=22098 epoch 015: 394 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=556689, ups=1.12, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=22098 epoch 015: 394 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=556689, ups=1.12, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=22098 epoch 015: 394 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=556689, ups=1.12, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=22098 epoch 015: 394 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=556689, ups=1.12, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=22098 epoch 015: 394 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=556689, ups=1.12, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=22098 epoch 015: 394 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=556689, ups=1.12, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=22098 epoch 015: 394 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=556689, ups=1.12, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=22098 epoch 015: 394 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=556689, ups=1.12, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=22098 epoch 015: 394 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=556689, ups=1.12, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=22098 epoch 015: 394 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=556689, ups=1.12, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=22098 epoch 015: 394 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=556689, ups=1.12, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=22098 epoch 015: 394 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=556689, ups=1.12, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=22098 epoch 015: 394 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=556689, ups=1.12, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=22098 epoch 015: 394 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=556689, ups=1.12, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=22098 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 3.756 | nll_loss 2.191 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.754 epoch 015 | valid on 'valid' subset | loss 3.756 | nll_loss 2.191 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.754 epoch 015 | valid on 'valid' subset | loss 3.756 | nll_loss 2.191 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.754 epoch 015 | valid on 'valid' subset | loss 3.756 | nll_loss 2.191 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.754 epoch 015 | valid on 'valid' subset | loss 3.756 | nll_loss 2.191 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.754 epoch 015 | valid on 'valid' subset | loss 3.756 | nll_loss 2.191 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.754 epoch 015 | valid on 'valid' subset | loss 3.756 | nll_loss 2.191 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.754 epoch 015 | valid on 'valid' subset | loss 3.756 | nll_loss 2.191 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.754 epoch 015 | valid on 'valid' subset | loss 3.756 | nll_loss 2.191 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.754 epoch 015 | valid on 'valid' subset | loss 3.756 | nll_loss 2.191 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.754 epoch 015 | valid on 'valid' subset | loss 3.756 | nll_loss 2.191 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.754 epoch 015 | valid on 'valid' subset | loss 3.756 | nll_loss 2.191 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.754 epoch 015 | valid on 'valid' subset | loss 3.756 | nll_loss 2.191 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.754 epoch 015 | valid on 'valid' subset | loss 3.756 | nll_loss 2.191 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.754 epoch 015 | valid on 'valid' subset | loss 3.756 | nll_loss 2.191 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.754 epoch 015: 494 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=482554, ups=0.97, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=22201 epoch 015: 494 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=482554, ups=0.97, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=22201 epoch 015: 494 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=482554, ups=0.97, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=22201 epoch 015: 494 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=482554, ups=0.97, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=22201 epoch 015: 494 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=482554, ups=0.97, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=22201 epoch 015: 494 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=482554, ups=0.97, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=22201 epoch 015: 494 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=482554, ups=0.97, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=22201 epoch 015: 494 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=482554, ups=0.97, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=22201 epoch 015: 494 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=482554, ups=0.97, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=22201 epoch 015: 494 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=482554, ups=0.97, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=22201 epoch 015: 494 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=482554, ups=0.97, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=22201 epoch 015: 494 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=482554, ups=0.97, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=22201 epoch 015: 494 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=482554, ups=0.97, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=22201 epoch 015: 494 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=482554, ups=0.97, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=22201 epoch 015: 494 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=482554, ups=0.97, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=22201 epoch 015: 594 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=552176, ups=1.12, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=22290 epoch 015: 594 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=552176, ups=1.12, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=22290 epoch 015: 594 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=552176, ups=1.12, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=22290 epoch 015: 594 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=552176, ups=1.12, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=22290 epoch 015: 594 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=552176, ups=1.12, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=22290 epoch 015: 594 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=552176, ups=1.12, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=22290 epoch 015: 594 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=552176, ups=1.12, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=22290 epoch 015: 594 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=552176, ups=1.12, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=22290 epoch 015: 594 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=552176, ups=1.12, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=22290 epoch 015: 594 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=552176, ups=1.12, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=22290 epoch 015: 594 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=552176, ups=1.12, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=22290 epoch 015: 594 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=552176, ups=1.12, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=22290 epoch 015: 594 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=552176, ups=1.12, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=22290 epoch 015: 594 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=552176, ups=1.12, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=22290 epoch 015: 594 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=552176, ups=1.12, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=22290 epoch 015: 695 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=544946, ups=1.1, wpb=495179, bsz=16695.8, num_updates=24300, lr=0.00040572, gnorm=0.207, clip=0, loss_scale=1, train_wall=90, gb_free=22.1, wall=22381 epoch 015: 695 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=544946, ups=1.1, wpb=495179, bsz=16695.8, num_updates=24300, lr=0.00040572, gnorm=0.207, clip=0, loss_scale=1, train_wall=90, gb_free=22.1, wall=22381 epoch 015: 695 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=544946, ups=1.1, wpb=495179, bsz=16695.8, num_updates=24300, lr=0.00040572, gnorm=0.207, clip=0, loss_scale=1, train_wall=90, gb_free=22.1, wall=22381 epoch 015: 695 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=544946, ups=1.1, wpb=495179, bsz=16695.8, num_updates=24300, lr=0.00040572, gnorm=0.207, clip=0, loss_scale=1, train_wall=90, gb_free=22.1, wall=22381 epoch 015: 695 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=544946, ups=1.1, wpb=495179, bsz=16695.8, num_updates=24300, lr=0.00040572, gnorm=0.207, clip=0, loss_scale=1, train_wall=90, gb_free=22.1, wall=22381 epoch 015: 695 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=544946, ups=1.1, wpb=495179, bsz=16695.8, num_updates=24300, lr=0.00040572, gnorm=0.207, clip=0, loss_scale=1, train_wall=90, gb_free=22.1, wall=22381 epoch 015: 695 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=544946, ups=1.1, wpb=495179, bsz=16695.8, num_updates=24300, lr=0.00040572, gnorm=0.207, clip=0, loss_scale=1, train_wall=90, gb_free=22.1, wall=22381 epoch 015: 695 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=544946, ups=1.1, wpb=495179, bsz=16695.8, num_updates=24300, lr=0.00040572, gnorm=0.207, clip=0, loss_scale=1, train_wall=90, gb_free=22.1, wall=22381 epoch 015: 695 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=544946, ups=1.1, wpb=495179, bsz=16695.8, num_updates=24300, lr=0.00040572, gnorm=0.207, clip=0, loss_scale=1, train_wall=90, gb_free=22.1, wall=22381 epoch 015: 695 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=544946, ups=1.1, wpb=495179, bsz=16695.8, num_updates=24300, lr=0.00040572, gnorm=0.207, clip=0, loss_scale=1, train_wall=90, gb_free=22.1, wall=22381 epoch 015: 695 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=544946, ups=1.1, wpb=495179, bsz=16695.8, num_updates=24300, lr=0.00040572, gnorm=0.207, clip=0, loss_scale=1, train_wall=90, gb_free=22.1, wall=22381 epoch 015: 695 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=544946, ups=1.1, wpb=495179, bsz=16695.8, num_updates=24300, lr=0.00040572, gnorm=0.207, clip=0, loss_scale=1, train_wall=90, gb_free=22.1, wall=22381 epoch 015: 695 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=544946, ups=1.1, wpb=495179, bsz=16695.8, num_updates=24300, lr=0.00040572, gnorm=0.207, clip=0, loss_scale=1, train_wall=90, gb_free=22.1, wall=22381 epoch 015: 695 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=544946, ups=1.1, wpb=495179, bsz=16695.8, num_updates=24300, lr=0.00040572, gnorm=0.207, clip=0, loss_scale=1, train_wall=90, gb_free=22.1, wall=22381 epoch 015: 695 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=544946, ups=1.1, wpb=495179, bsz=16695.8, num_updates=24300, lr=0.00040572, gnorm=0.207, clip=0, loss_scale=1, train_wall=90, gb_free=22.1, wall=22381 epoch 015: 795 / 1689 loss=3.718, nll_loss=2.186, ppl=4.55, wps=556653, ups=1.12, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=22470 epoch 015: 795 / 1689 loss=3.718, nll_loss=2.186, ppl=4.55, wps=556653, ups=1.12, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=22470 epoch 015: 795 / 1689 loss=3.718, nll_loss=2.186, ppl=4.55, wps=556653, ups=1.12, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=22470 epoch 015: 795 / 1689 loss=3.718, nll_loss=2.186, ppl=4.55, wps=556653, ups=1.12, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=22470 epoch 015: 795 / 1689 loss=3.718, nll_loss=2.186, ppl=4.55, wps=556653, ups=1.12, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=22470 epoch 015: 795 / 1689 loss=3.718, nll_loss=2.186, ppl=4.55, wps=556653, ups=1.12, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=22470 epoch 015: 795 / 1689 loss=3.718, nll_loss=2.186, ppl=4.55, wps=556653, ups=1.12, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=22470 epoch 015: 795 / 1689 loss=3.718, nll_loss=2.186, ppl=4.55, wps=556653, ups=1.12, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=22470 epoch 015: 795 / 1689 loss=3.718, nll_loss=2.186, ppl=4.55, wps=556653, ups=1.12, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=22470 epoch 015: 795 / 1689 loss=3.718, nll_loss=2.186, ppl=4.55, wps=556653, ups=1.12, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=22470 epoch 015: 795 / 1689 loss=3.718, nll_loss=2.186, ppl=4.55, wps=556653, ups=1.12, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=22470 epoch 015: 795 / 1689 loss=3.718, nll_loss=2.186, ppl=4.55, wps=556653, ups=1.12, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=22470 epoch 015: 795 / 1689 loss=3.718, nll_loss=2.186, ppl=4.55, wps=556653, ups=1.12, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=22470 epoch 015: 795 / 1689 loss=3.718, nll_loss=2.186, ppl=4.55, wps=556653, ups=1.12, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=22470 epoch 015: 795 / 1689 loss=3.718, nll_loss=2.186, ppl=4.55, wps=556653, ups=1.12, wpb=495298, bsz=16288.4, num_updates=24400, lr=0.000404888, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=22470 epoch 015: 895 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=559069, ups=1.13, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22559 epoch 015: 895 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=559069, ups=1.13, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22559 epoch 015: 895 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=559069, ups=1.13, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22559 epoch 015: 895 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=559069, ups=1.13, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22559 epoch 015: 895 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=559069, ups=1.13, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22559 epoch 015: 895 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=559069, ups=1.13, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22559 epoch 015: 895 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=559069, ups=1.13, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22559 epoch 015: 895 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=559069, ups=1.13, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22559 epoch 015: 895 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=559069, ups=1.13, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22559 epoch 015: 895 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=559069, ups=1.13, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22559 epoch 015: 895 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=559069, ups=1.13, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22559 epoch 015: 895 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=559069, ups=1.13, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22559 epoch 015: 895 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=559069, ups=1.13, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22559 epoch 015: 895 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=559069, ups=1.13, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22559 epoch 015: 895 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=559069, ups=1.13, wpb=494400, bsz=16271, num_updates=24500, lr=0.000404061, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22559 epoch 015: 995 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=561046, ups=1.13, wpb=497589, bsz=16252.8, num_updates=24600, lr=0.000403239, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22647 epoch 015: 995 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=561046, ups=1.13, wpb=497589, bsz=16252.8, num_updates=24600, lr=0.000403239, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22647 epoch 015: 995 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=561046, ups=1.13, wpb=497589, bsz=16252.8, num_updates=24600, lr=0.000403239, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22647 epoch 015: 995 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=561046, ups=1.13, wpb=497589, bsz=16252.8, num_updates=24600, lr=0.000403239, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22647 epoch 015: 995 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=561046, ups=1.13, wpb=497589, bsz=16252.8, num_updates=24600, lr=0.000403239, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22647 epoch 015: 995 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=561046, ups=1.13, wpb=497589, bsz=16252.8, num_updates=24600, lr=0.000403239, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22647 epoch 015: 995 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=561046, ups=1.13, wpb=497589, bsz=16252.8, num_updates=24600, lr=0.000403239, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22647 epoch 015: 995 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=561046, ups=1.13, wpb=497589, bsz=16252.8, num_updates=24600, lr=0.000403239, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22647 epoch 015: 995 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=561046, ups=1.13, wpb=497589, bsz=16252.8, num_updates=24600, lr=0.000403239, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22647 epoch 015: 995 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=561046, ups=1.13, wpb=497589, bsz=16252.8, num_updates=24600, lr=0.000403239, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22647 epoch 015: 995 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=561046, ups=1.13, wpb=497589, bsz=16252.8, num_updates=24600, lr=0.000403239, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22647 epoch 015: 995 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=561046, ups=1.13, wpb=497589, bsz=16252.8, num_updates=24600, lr=0.000403239, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22647 epoch 015: 995 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=561046, ups=1.13, wpb=497589, bsz=16252.8, num_updates=24600, lr=0.000403239, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22647 epoch 015: 995 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=561046, ups=1.13, wpb=497589, bsz=16252.8, num_updates=24600, lr=0.000403239, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22647 epoch 015: 995 / 1689 loss=3.724, nll_loss=2.193, ppl=4.57, wps=561046, ups=1.13, wpb=497589, bsz=16252.8, num_updates=24600, lr=0.000403239, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22647 epoch 015: 1095 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=555644, ups=1.12, wpb=495022, bsz=16755.6, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22736 epoch 015: 1095 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=555644, ups=1.12, wpb=495022, bsz=16755.6, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22736 epoch 015: 1095 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=555644, ups=1.12, wpb=495022, bsz=16755.6, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22736 epoch 015: 1095 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=555644, ups=1.12, wpb=495022, bsz=16755.6, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22736 epoch 015: 1095 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=555644, ups=1.12, wpb=495022, bsz=16755.6, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22736 epoch 015: 1095 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=555644, ups=1.12, wpb=495022, bsz=16755.6, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22736 epoch 015: 1095 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=555644, ups=1.12, wpb=495022, bsz=16755.6, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22736 epoch 015: 1095 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=555644, ups=1.12, wpb=495022, bsz=16755.6, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22736 epoch 015: 1095 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=555644, ups=1.12, wpb=495022, bsz=16755.6, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22736 epoch 015: 1095 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=555644, ups=1.12, wpb=495022, bsz=16755.6, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22736 epoch 015: 1095 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=555644, ups=1.12, wpb=495022, bsz=16755.6, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22736 epoch 015: 1095 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=555644, ups=1.12, wpb=495022, bsz=16755.6, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22736 epoch 015: 1095 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=555644, ups=1.12, wpb=495022, bsz=16755.6, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22736 epoch 015: 1095 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=555644, ups=1.12, wpb=495022, bsz=16755.6, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22736 epoch 015: 1095 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=555644, ups=1.12, wpb=495022, bsz=16755.6, num_updates=24700, lr=0.000402422, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=22736 epoch 015: 1195 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=557240, ups=1.13, wpb=493832, bsz=16218.3, num_updates=24800, lr=0.00040161, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=22825 epoch 015: 1195 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=557240, ups=1.13, wpb=493832, bsz=16218.3, num_updates=24800, lr=0.00040161, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=22825 epoch 015: 1195 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=557240, ups=1.13, wpb=493832, bsz=16218.3, num_updates=24800, lr=0.00040161, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=22825 epoch 015: 1195 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=557240, ups=1.13, wpb=493832, bsz=16218.3, num_updates=24800, lr=0.00040161, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=22825 epoch 015: 1195 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=557240, ups=1.13, wpb=493832, bsz=16218.3, num_updates=24800, lr=0.00040161, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=22825 epoch 015: 1195 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=557240, ups=1.13, wpb=493832, bsz=16218.3, num_updates=24800, lr=0.00040161, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=22825 epoch 015: 1195 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=557240, ups=1.13, wpb=493832, bsz=16218.3, num_updates=24800, lr=0.00040161, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=22825 epoch 015: 1195 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=557240, ups=1.13, wpb=493832, bsz=16218.3, num_updates=24800, lr=0.00040161, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=22825 epoch 015: 1195 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=557240, ups=1.13, wpb=493832, bsz=16218.3, num_updates=24800, lr=0.00040161, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=22825 epoch 015: 1195 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=557240, ups=1.13, wpb=493832, bsz=16218.3, num_updates=24800, lr=0.00040161, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=22825 epoch 015: 1195 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=557240, ups=1.13, wpb=493832, bsz=16218.3, num_updates=24800, lr=0.00040161, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=22825 epoch 015: 1195 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=557240, ups=1.13, wpb=493832, bsz=16218.3, num_updates=24800, lr=0.00040161, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=22825 epoch 015: 1195 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=557240, ups=1.13, wpb=493832, bsz=16218.3, num_updates=24800, lr=0.00040161, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=22825 epoch 015: 1195 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=557240, ups=1.13, wpb=493832, bsz=16218.3, num_updates=24800, lr=0.00040161, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=22825 epoch 015: 1195 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=557240, ups=1.13, wpb=493832, bsz=16218.3, num_updates=24800, lr=0.00040161, gnorm=0.214, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=22825 epoch 015: 1295 / 1689 loss=3.713, nll_loss=2.181, ppl=4.54, wps=553438, ups=1.12, wpb=495357, bsz=17017.6, num_updates=24900, lr=0.000400802, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=22915 epoch 015: 1295 / 1689 loss=3.713, nll_loss=2.181, ppl=4.54, wps=553438, ups=1.12, wpb=495357, bsz=17017.6, num_updates=24900, lr=0.000400802, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=22915 epoch 015: 1295 / 1689 loss=3.713, nll_loss=2.181, ppl=4.54, wps=553438, ups=1.12, wpb=495357, bsz=17017.6, num_updates=24900, lr=0.000400802, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=22915 epoch 015: 1295 / 1689 loss=3.713, nll_loss=2.181, ppl=4.54, wps=553438, ups=1.12, wpb=495357, bsz=17017.6, num_updates=24900, lr=0.000400802, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=22915 epoch 015: 1295 / 1689 loss=3.713, nll_loss=2.181, ppl=4.54, wps=553438, ups=1.12, wpb=495357, bsz=17017.6, num_updates=24900, lr=0.000400802, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=22915 epoch 015: 1295 / 1689 loss=3.713, nll_loss=2.181, ppl=4.54, wps=553438, ups=1.12, wpb=495357, bsz=17017.6, num_updates=24900, lr=0.000400802, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=22915 epoch 015: 1295 / 1689 loss=3.713, nll_loss=2.181, ppl=4.54, wps=553438, ups=1.12, wpb=495357, bsz=17017.6, num_updates=24900, lr=0.000400802, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=22915 epoch 015: 1295 / 1689 loss=3.713, nll_loss=2.181, ppl=4.54, wps=553438, ups=1.12, wpb=495357, bsz=17017.6, num_updates=24900, lr=0.000400802, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=22915 epoch 015: 1295 / 1689 loss=3.713, nll_loss=2.181, ppl=4.54, wps=553438, ups=1.12, wpb=495357, bsz=17017.6, num_updates=24900, lr=0.000400802, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=22915 epoch 015: 1295 / 1689 loss=3.713, nll_loss=2.181, ppl=4.54, wps=553438, ups=1.12, wpb=495357, bsz=17017.6, num_updates=24900, lr=0.000400802, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=22915 epoch 015: 1295 / 1689 loss=3.713, nll_loss=2.181, ppl=4.54, wps=553438, ups=1.12, wpb=495357, bsz=17017.6, num_updates=24900, lr=0.000400802, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=22915 epoch 015: 1295 / 1689 loss=3.713, nll_loss=2.181, ppl=4.54, wps=553438, ups=1.12, wpb=495357, bsz=17017.6, num_updates=24900, lr=0.000400802, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=22915 epoch 015: 1295 / 1689 loss=3.713, nll_loss=2.181, ppl=4.54, wps=553438, ups=1.12, wpb=495357, bsz=17017.6, num_updates=24900, lr=0.000400802, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=22915 epoch 015: 1295 / 1689 loss=3.713, nll_loss=2.181, ppl=4.54, wps=553438, ups=1.12, wpb=495357, bsz=17017.6, num_updates=24900, lr=0.000400802, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=22915 epoch 015: 1295 / 1689 loss=3.713, nll_loss=2.181, ppl=4.54, wps=553438, ups=1.12, wpb=495357, bsz=17017.6, num_updates=24900, lr=0.000400802, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=22915 epoch 015: 1395 / 1689 loss=3.717, nll_loss=2.186, ppl=4.55, wps=553339, ups=1.11, wpb=497008, bsz=16499, num_updates=25000, lr=0.0004, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=23004 epoch 015: 1395 / 1689 loss=3.717, nll_loss=2.186, ppl=4.55, wps=553339, ups=1.11, wpb=497008, bsz=16499, num_updates=25000, lr=0.0004, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=23004 epoch 015: 1395 / 1689 loss=3.717, nll_loss=2.186, ppl=4.55, wps=553339, ups=1.11, wpb=497008, bsz=16499, num_updates=25000, lr=0.0004, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=23004 epoch 015: 1395 / 1689 loss=3.717, nll_loss=2.186, ppl=4.55, wps=553339, ups=1.11, wpb=497008, bsz=16499, num_updates=25000, lr=0.0004, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=23004 epoch 015: 1395 / 1689 loss=3.717, nll_loss=2.186, ppl=4.55, wps=553339, ups=1.11, wpb=497008, bsz=16499, num_updates=25000, lr=0.0004, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=23004 epoch 015: 1395 / 1689 loss=3.717, nll_loss=2.186, ppl=4.55, wps=553339, ups=1.11, wpb=497008, bsz=16499, num_updates=25000, lr=0.0004, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=23004 epoch 015: 1395 / 1689 loss=3.717, nll_loss=2.186, ppl=4.55, wps=553339, ups=1.11, wpb=497008, bsz=16499, num_updates=25000, lr=0.0004, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=23004 epoch 015: 1395 / 1689 loss=3.717, nll_loss=2.186, ppl=4.55, wps=553339, ups=1.11, wpb=497008, bsz=16499, num_updates=25000, lr=0.0004, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=23004 epoch 015: 1395 / 1689 loss=3.717, nll_loss=2.186, ppl=4.55, wps=553339, ups=1.11, wpb=497008, bsz=16499, num_updates=25000, lr=0.0004, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=23004 epoch 015: 1395 / 1689 loss=3.717, nll_loss=2.186, ppl=4.55, wps=553339, ups=1.11, wpb=497008, bsz=16499, num_updates=25000, lr=0.0004, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=23004 epoch 015: 1395 / 1689 loss=3.717, nll_loss=2.186, ppl=4.55, wps=553339, ups=1.11, wpb=497008, bsz=16499, num_updates=25000, lr=0.0004, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=23004 epoch 015: 1395 / 1689 loss=3.717, nll_loss=2.186, ppl=4.55, wps=553339, ups=1.11, wpb=497008, bsz=16499, num_updates=25000, lr=0.0004, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=23004 epoch 015: 1395 / 1689 loss=3.717, nll_loss=2.186, ppl=4.55, wps=553339, ups=1.11, wpb=497008, bsz=16499, num_updates=25000, lr=0.0004, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=23004 epoch 015: 1395 / 1689 loss=3.717, nll_loss=2.186, ppl=4.55, wps=553339, ups=1.11, wpb=497008, bsz=16499, num_updates=25000, lr=0.0004, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=23004 epoch 015: 1395 / 1689 loss=3.717, nll_loss=2.186, ppl=4.55, wps=553339, ups=1.11, wpb=497008, bsz=16499, num_updates=25000, lr=0.0004, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=23004 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 3.744 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.744 epoch 015 | valid on 'valid' subset | loss 3.744 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.744 epoch 015 | valid on 'valid' subset | loss 3.744 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.744 epoch 015 | valid on 'valid' subset | loss 3.744 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.744 epoch 015 | valid on 'valid' subset | loss 3.744 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.744 epoch 015 | valid on 'valid' subset | loss 3.744 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.744 epoch 015 | valid on 'valid' subset | loss 3.744 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.744 epoch 015 | valid on 'valid' subset | loss 3.744 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.744 epoch 015 | valid on 'valid' subset | loss 3.744 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.744 epoch 015 | valid on 'valid' subset | loss 3.744 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.744 epoch 015 | valid on 'valid' subset | loss 3.744 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.744 epoch 015 | valid on 'valid' subset | loss 3.744 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.744 epoch 015 | valid on 'valid' subset | loss 3.744 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.744 epoch 015 | valid on 'valid' subset | loss 3.744 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.744 epoch 015 | valid on 'valid' subset | loss 3.744 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.744 epoch 015: 1495 / 1689 loss=3.719, nll_loss=2.188, ppl=4.56, wps=379662, ups=0.77, wpb=494793, bsz=16897.5, num_updates=25100, lr=0.000399202, gnorm=0.2, clip=0, loss_scale=2, train_wall=86, gb_free=22.2, wall=23135 epoch 015: 1495 / 1689 loss=3.719, nll_loss=2.188, ppl=4.56, wps=379662, ups=0.77, wpb=494793, bsz=16897.5, num_updates=25100, lr=0.000399202, gnorm=0.2, clip=0, loss_scale=2, train_wall=86, gb_free=22.2, wall=23135 epoch 015: 1495 / 1689 loss=3.719, nll_loss=2.188, ppl=4.56, wps=379662, ups=0.77, wpb=494793, bsz=16897.5, num_updates=25100, lr=0.000399202, gnorm=0.2, clip=0, loss_scale=2, train_wall=86, gb_free=22.2, wall=23135 epoch 015: 1495 / 1689 loss=3.719, nll_loss=2.188, ppl=4.56, wps=379662, ups=0.77, wpb=494793, bsz=16897.5, num_updates=25100, lr=0.000399202, gnorm=0.2, clip=0, loss_scale=2, train_wall=86, gb_free=22.2, wall=23135 epoch 015: 1495 / 1689 loss=3.719, nll_loss=2.188, ppl=4.56, wps=379662, ups=0.77, wpb=494793, bsz=16897.5, num_updates=25100, lr=0.000399202, gnorm=0.2, clip=0, loss_scale=2, train_wall=86, gb_free=22.2, wall=23135 epoch 015: 1495 / 1689 loss=3.719, nll_loss=2.188, ppl=4.56, wps=379662, ups=0.77, wpb=494793, bsz=16897.5, num_updates=25100, lr=0.000399202, gnorm=0.2, clip=0, loss_scale=2, train_wall=86, gb_free=22.2, wall=23135 epoch 015: 1495 / 1689 loss=3.719, nll_loss=2.188, ppl=4.56, wps=379662, ups=0.77, wpb=494793, bsz=16897.5, num_updates=25100, lr=0.000399202, gnorm=0.2, clip=0, loss_scale=2, train_wall=86, gb_free=22.2, wall=23135 epoch 015: 1495 / 1689 loss=3.719, nll_loss=2.188, ppl=4.56, wps=379662, ups=0.77, wpb=494793, bsz=16897.5, num_updates=25100, lr=0.000399202, gnorm=0.2, clip=0, loss_scale=2, train_wall=86, gb_free=22.2, wall=23135 epoch 015: 1495 / 1689 loss=3.719, nll_loss=2.188, ppl=4.56, wps=379662, ups=0.77, wpb=494793, bsz=16897.5, num_updates=25100, lr=0.000399202, gnorm=0.2, clip=0, loss_scale=2, train_wall=86, gb_free=22.2, wall=23135 epoch 015: 1495 / 1689 loss=3.719, nll_loss=2.188, ppl=4.56, wps=379662, ups=0.77, wpb=494793, bsz=16897.5, num_updates=25100, lr=0.000399202, gnorm=0.2, clip=0, loss_scale=2, train_wall=86, gb_free=22.2, wall=23135 epoch 015: 1495 / 1689 loss=3.719, nll_loss=2.188, ppl=4.56, wps=379662, ups=0.77, wpb=494793, bsz=16897.5, num_updates=25100, lr=0.000399202, gnorm=0.2, clip=0, loss_scale=2, train_wall=86, gb_free=22.2, wall=23135 epoch 015: 1495 / 1689 loss=3.719, nll_loss=2.188, ppl=4.56, wps=379662, ups=0.77, wpb=494793, bsz=16897.5, num_updates=25100, lr=0.000399202, gnorm=0.2, clip=0, loss_scale=2, train_wall=86, gb_free=22.2, wall=23135 epoch 015: 1495 / 1689 loss=3.719, nll_loss=2.188, ppl=4.56, wps=379662, ups=0.77, wpb=494793, bsz=16897.5, num_updates=25100, lr=0.000399202, gnorm=0.2, clip=0, loss_scale=2, train_wall=86, gb_free=22.2, wall=23135 epoch 015: 1495 / 1689 loss=3.719, nll_loss=2.188, ppl=4.56, wps=379662, ups=0.77, wpb=494793, bsz=16897.5, num_updates=25100, lr=0.000399202, gnorm=0.2, clip=0, loss_scale=2, train_wall=86, gb_free=22.2, wall=23135 epoch 015: 1495 / 1689 loss=3.719, nll_loss=2.188, ppl=4.56, wps=379662, ups=0.77, wpb=494793, bsz=16897.5, num_updates=25100, lr=0.000399202, gnorm=0.2, clip=0, loss_scale=2, train_wall=86, gb_free=22.2, wall=23135 epoch 015: 1595 / 1689 loss=3.726, nll_loss=2.196, ppl=4.58, wps=562023, ups=1.13, wpb=496171, bsz=16480.2, num_updates=25200, lr=0.00039841, gnorm=0.215, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=23223 epoch 015: 1595 / 1689 loss=3.726, nll_loss=2.196, ppl=4.58, wps=562023, ups=1.13, wpb=496171, bsz=16480.2, num_updates=25200, lr=0.00039841, gnorm=0.215, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=23223 epoch 015: 1595 / 1689 loss=3.726, nll_loss=2.196, ppl=4.58, wps=562023, ups=1.13, wpb=496171, bsz=16480.2, num_updates=25200, lr=0.00039841, gnorm=0.215, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=23223 epoch 015: 1595 / 1689 loss=3.726, nll_loss=2.196, ppl=4.58, wps=562023, ups=1.13, wpb=496171, bsz=16480.2, num_updates=25200, lr=0.00039841, gnorm=0.215, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=23223 epoch 015: 1595 / 1689 loss=3.726, nll_loss=2.196, ppl=4.58, wps=562023, ups=1.13, wpb=496171, bsz=16480.2, num_updates=25200, lr=0.00039841, gnorm=0.215, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=23223 epoch 015: 1595 / 1689 loss=3.726, nll_loss=2.196, ppl=4.58, wps=562023, ups=1.13, wpb=496171, bsz=16480.2, num_updates=25200, lr=0.00039841, gnorm=0.215, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=23223 epoch 015: 1595 / 1689 loss=3.726, nll_loss=2.196, ppl=4.58, wps=562023, ups=1.13, wpb=496171, bsz=16480.2, num_updates=25200, lr=0.00039841, gnorm=0.215, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=23223 epoch 015: 1595 / 1689 loss=3.726, nll_loss=2.196, ppl=4.58, wps=562023, ups=1.13, wpb=496171, bsz=16480.2, num_updates=25200, lr=0.00039841, gnorm=0.215, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=23223 epoch 015: 1595 / 1689 loss=3.726, nll_loss=2.196, ppl=4.58, wps=562023, ups=1.13, wpb=496171, bsz=16480.2, num_updates=25200, lr=0.00039841, gnorm=0.215, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=23223 epoch 015: 1595 / 1689 loss=3.726, nll_loss=2.196, ppl=4.58, wps=562023, ups=1.13, wpb=496171, bsz=16480.2, num_updates=25200, lr=0.00039841, gnorm=0.215, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=23223 epoch 015: 1595 / 1689 loss=3.726, nll_loss=2.196, ppl=4.58, wps=562023, ups=1.13, wpb=496171, bsz=16480.2, num_updates=25200, lr=0.00039841, gnorm=0.215, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=23223 epoch 015: 1595 / 1689 loss=3.726, nll_loss=2.196, ppl=4.58, wps=562023, ups=1.13, wpb=496171, bsz=16480.2, num_updates=25200, lr=0.00039841, gnorm=0.215, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=23223 epoch 015: 1595 / 1689 loss=3.726, nll_loss=2.196, ppl=4.58, wps=562023, ups=1.13, wpb=496171, bsz=16480.2, num_updates=25200, lr=0.00039841, gnorm=0.215, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=23223 epoch 015: 1595 / 1689 loss=3.726, nll_loss=2.196, ppl=4.58, wps=562023, ups=1.13, wpb=496171, bsz=16480.2, num_updates=25200, lr=0.00039841, gnorm=0.215, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=23223 epoch 015: 1595 / 1689 loss=3.726, nll_loss=2.196, ppl=4.58, wps=562023, ups=1.13, wpb=496171, bsz=16480.2, num_updates=25200, lr=0.00039841, gnorm=0.215, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=23223 end of epoch 15 (average epoch stats below) epoch 015 | loss 3.717 | nll_loss 2.185 | ppl 4.55 | wps 535230 | ups 1.08 | wpb 495128 | bsz 16507.5 | num_updates 25293 | lr 0.000397676 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1480 | gb_free 24.3 | wall 23305 epoch 015 | loss 3.717 | nll_loss 2.185 | ppl 4.55 | wps 535230 | ups 1.08 | wpb 495128 | bsz 16507.5 | num_updates 25293 | lr 0.000397676 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1480 | gb_free 24.3 | wall 23305 epoch 015 | loss 3.717 | nll_loss 2.185 | ppl 4.55 | wps 535230 | ups 1.08 | wpb 495128 | bsz 16507.5 | num_updates 25293 | lr 0.000397676 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1480 | gb_free 24.3 | wall 23305 epoch 015 | loss 3.717 | nll_loss 2.185 | ppl 4.55 | wps 535230 | ups 1.08 | wpb 495128 | bsz 16507.5 | num_updates 25293 | lr 0.000397676 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1480 | gb_free 24.3 | wall 23305 epoch 015 | loss 3.717 | nll_loss 2.185 | ppl 4.55 | wps 535230 | ups 1.08 | wpb 495128 | bsz 16507.5 | num_updates 25293 | lr 0.000397676 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1480 | gb_free 24.3 | wall 23305 epoch 015 | loss 3.717 | nll_loss 2.185 | ppl 4.55 | wps 535230 | ups 1.08 | wpb 495128 | bsz 16507.5 | num_updates 25293 | lr 0.000397676 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1480 | gb_free 24.3 | wall 23305 epoch 015 | loss 3.717 | nll_loss 2.185 | ppl 4.55 | wps 535230 | ups 1.08 | wpb 495128 | bsz 16507.5 | num_updates 25293 | lr 0.000397676 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1480 | gb_free 24.3 | wall 23305 epoch 015 | loss 3.717 | nll_loss 2.185 | ppl 4.55 | wps 535230 | ups 1.08 | wpb 495128 | bsz 16507.5 | num_updates 25293 | lr 0.000397676 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1480 | gb_free 24.3 | wall 23305 epoch 015 | loss 3.717 | nll_loss 2.185 | ppl 4.55 | wps 535230 | ups 1.08 | wpb 495128 | bsz 16507.5 | num_updates 25293 | lr 0.000397676 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1480 | gb_free 24.3 | wall 23305 epoch 015 | loss 3.717 | nll_loss 2.185 | ppl 4.55 | wps 535230 | ups 1.08 | wpb 495128 | bsz 16507.5 | num_updates 25293 | lr 0.000397676 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1480 | gb_free 24.3 | wall 23305 epoch 015 | loss 3.717 | nll_loss 2.185 | ppl 4.55 | wps 535230 | ups 1.08 | wpb 495128 | bsz 16507.5 | num_updates 25293 | lr 0.000397676 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1480 | gb_free 24.3 | wall 23305 epoch 015 | loss 3.717 | nll_loss 2.185 | ppl 4.55 | wps 535230 | ups 1.08 | wpb 495128 | bsz 16507.5 | num_updates 25293 | lr 0.000397676 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1480 | gb_free 24.3 | wall 23305 epoch 015 | loss 3.717 | nll_loss 2.185 | ppl 4.55 | wps 535230 | ups 1.08 | wpb 495128 | bsz 16507.5 | num_updates 25293 | lr 0.000397676 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1480 | gb_free 24.3 | wall 23305 epoch 015 | loss 3.717 | nll_loss 2.185 | ppl 4.55 | wps 535230 | ups 1.08 | wpb 495128 | bsz 16507.5 | num_updates 25293 | lr 0.000397676 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1480 | gb_free 24.3 | wall 23305 epoch 015 | loss 3.717 | nll_loss 2.185 | ppl 4.55 | wps 535230 | ups 1.08 | wpb 495128 | bsz 16507.5 | num_updates 25293 | lr 0.000397676 | gnorm 0.204 | clip 0 | loss_scale 2 | train_wall 1480 | gb_free 24.3 | wall 23305 Start iterating over samples epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 7 / 1689 loss=3.725, nll_loss=2.194, ppl=4.58, wps=527721, ups=1.07, wpb=491709, bsz=16043.2, num_updates=25300, lr=0.000397621, gnorm=0.195, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23316 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 107 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=559248, ups=1.13, wpb=495002, bsz=16321.1, num_updates=25400, lr=0.000396838, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=23405 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 207 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=557722, ups=1.12, wpb=496509, bsz=16657, num_updates=25500, lr=0.000396059, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=23494 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 307 / 1689 loss=3.707, nll_loss=2.174, ppl=4.51, wps=552114, ups=1.12, wpb=493618, bsz=16222.7, num_updates=25600, lr=0.000395285, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=22.5, wall=23583 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 408 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=551591, ups=1.11, wpb=495043, bsz=16168.7, num_updates=25700, lr=0.000394515, gnorm=0.201, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=23673 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 508 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=556522, ups=1.12, wpb=496175, bsz=16195.5, num_updates=25800, lr=0.00039375, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=23762 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 608 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=552212, ups=1.12, wpb=494828, bsz=16639.2, num_updates=25900, lr=0.000392989, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=23852 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 epoch 016: 708 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=550496, ups=1.11, wpb=494080, bsz=16661.8, num_updates=26000, lr=0.000392232, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=23941 begin validation on "valid" subset epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016 | valid on 'valid' subset | loss 3.744 | nll_loss 2.182 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.744 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 808 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=453564, ups=0.92, wpb=494230, bsz=16607.3, num_updates=26100, lr=0.00039148, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=24050 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 908 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=557175, ups=1.12, wpb=495695, bsz=16647.6, num_updates=26200, lr=0.000390732, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=24139 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1008 / 1689 loss=3.707, nll_loss=2.175, ppl=4.51, wps=557678, ups=1.12, wpb=495793, bsz=16314, num_updates=26300, lr=0.000389989, gnorm=0.196, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=24228 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1109 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=550456, ups=1.11, wpb=496281, bsz=16550, num_updates=26400, lr=0.000389249, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24318 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1209 / 1689 loss=3.711, nll_loss=2.18, ppl=4.53, wps=553486, ups=1.12, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=24408 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1309 / 1689 loss=3.708, nll_loss=2.176, ppl=4.52, wps=549809, ups=1.11, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=24498 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1409 / 1689 loss=3.707, nll_loss=2.175, ppl=4.52, wps=552408, ups=1.12, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=24588 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1509 / 1689 loss=3.711, nll_loss=2.179, ppl=4.53, wps=556161, ups=1.12, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=24677 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 epoch 016: 1609 / 1689 loss=3.71, nll_loss=2.178, ppl=4.52, wps=560124, ups=1.13, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=22.3, wall=24766 end of epoch 16 (average epoch stats below) epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 epoch 016 | loss 3.708 | nll_loss 2.175 | ppl 4.52 | wps 545641 | ups 1.1 | wpb 495120 | bsz 16503.8 | num_updates 26980 | lr 0.000385043 | gnorm 0.199 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.3 | wall 24836 Start iterating over samples epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 epoch 017: 20 / 1689 loss=3.709, nll_loss=2.177, ppl=4.52, wps=548142, ups=1.12, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.207, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=24855 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017 | valid on 'valid' subset | loss 3.745 | nll_loss 2.177 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.744 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 120 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=381499, ups=0.77, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.2, wall=24985 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 220 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=561256, ups=1.13, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.21, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25073 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 320 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=557123, ups=1.13, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.192, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=25162 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 420 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=558314, ups=1.13, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=25251 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 521 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=559684, ups=1.13, wpb=494288, bsz=16433, num_updates=27500, lr=0.000381385, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=25339 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 621 / 1689 loss=3.702, nll_loss=2.168, ppl=4.5, wps=558068, ups=1.13, wpb=493574, bsz=16241.4, num_updates=27600, lr=0.000380693, gnorm=0.199, clip=0, loss_scale=2, train_wall=87, gb_free=22.6, wall=25428 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 721 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=558415, ups=1.13, wpb=495198, bsz=16276.2, num_updates=27700, lr=0.000380006, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=25516 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 822 / 1689 loss=3.701, nll_loss=2.168, ppl=4.49, wps=553223, ups=1.11, wpb=496888, bsz=17029.5, num_updates=27800, lr=0.000379322, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25606 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 922 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=553711, ups=1.12, wpb=495411, bsz=16672.9, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25696 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 epoch 017: 1022 / 1689 loss=3.705, nll_loss=2.172, ppl=4.51, wps=552512, ups=1.12, wpb=494784, bsz=16371.9, num_updates=28000, lr=0.000377964, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25785 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017 | valid on 'valid' subset | loss 3.74 | nll_loss 2.176 | ppl 4.52 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.74 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1122 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=456067, ups=0.92, wpb=496626, bsz=16690.5, num_updates=28100, lr=0.000377291, gnorm=0.197, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=25894 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1222 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=556819, ups=1.12, wpb=495186, bsz=16333.9, num_updates=28200, lr=0.000376622, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=25983 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1322 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=553890, ups=1.12, wpb=494909, bsz=16403.8, num_updates=28300, lr=0.000375956, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=26072 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1423 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=548625, ups=1.11, wpb=495498, bsz=16492.5, num_updates=28400, lr=0.000375293, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26163 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1523 / 1689 loss=3.699, nll_loss=2.166, ppl=4.49, wps=557341, ups=1.12, wpb=496729, bsz=16691.4, num_updates=28500, lr=0.000374634, gnorm=0.209, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=26252 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 epoch 017: 1623 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=548962, ups=1.11, wpb=495626, bsz=16864.9, num_updates=28600, lr=0.000373979, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=26342 end of epoch 17 (average epoch stats below) epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 epoch 017 | loss 3.699 | nll_loss 2.166 | ppl 4.49 | wps 533842 | ups 1.08 | wpb 495113 | bsz 16505.7 | num_updates 28666 | lr 0.000373548 | gnorm 0.198 | clip 0 | loss_scale 1 | train_wall 1481 | gb_free 21.4 | wall 26399 Start iterating over samples epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 34 / 1689 loss=3.696, nll_loss=2.162, ppl=4.48, wps=555884, ups=1.13, wpb=490199, bsz=16047, num_updates=28700, lr=0.000373327, gnorm=0.196, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=26430 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 134 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=554384, ups=1.12, wpb=494983, bsz=16423.6, num_updates=28800, lr=0.000372678, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=26520 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 234 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=558889, ups=1.13, wpb=495985, bsz=16379.2, num_updates=28900, lr=0.000372033, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.2, wall=26608 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 epoch 018: 335 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=549882, ups=1.11, wpb=495861, bsz=16554.7, num_updates=29000, lr=0.000371391, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=26698 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.732 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.732 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 435 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453400, ups=0.91, wpb=498460, bsz=16628.4, num_updates=29100, lr=0.000370752, gnorm=0.208, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=26808 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 535 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=551958, ups=1.11, wpb=495571, bsz=16509.4, num_updates=29200, lr=0.000370117, gnorm=0.199, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=26898 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 635 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=558347, ups=1.13, wpb=495853, bsz=16322.2, num_updates=29300, lr=0.000369484, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=26987 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 735 / 1689 loss=3.692, nll_loss=2.158, ppl=4.46, wps=563888, ups=1.14, wpb=495670, bsz=16543, num_updates=29400, lr=0.000368856, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=27075 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 835 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=560432, ups=1.13, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=27163 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 936 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=555238, ups=1.12, wpb=496178, bsz=16330.6, num_updates=29600, lr=0.000367607, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=27253 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1036 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=556827, ups=1.13, wpb=494292, bsz=16337, num_updates=29700, lr=0.000366988, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=27341 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1136 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=555669, ups=1.12, wpb=495597, bsz=16644.3, num_updates=29800, lr=0.000366372, gnorm=0.204, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=27431 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1236 / 1689 loss=3.69, nll_loss=2.156, ppl=4.46, wps=551428, ups=1.11, wpb=495469, bsz=16643, num_updates=29900, lr=0.000365758, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=27520 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 epoch 018: 1336 / 1689 loss=3.691, nll_loss=2.157, ppl=4.46, wps=557968, ups=1.13, wpb=495090, bsz=16285.4, num_updates=30000, lr=0.000365148, gnorm=0.201, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=27609 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018 | valid on 'valid' subset | loss 3.733 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.732 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1436 / 1689 loss=3.697, nll_loss=2.164, ppl=4.48, wps=419005, ups=0.85, wpb=495217, bsz=16364.5, num_updates=30100, lr=0.000364541, gnorm=0.199, clip=0, loss_scale=2, train_wall=101, gb_free=22, wall=27727 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1536 / 1689 loss=3.698, nll_loss=2.165, ppl=4.49, wps=554252, ups=1.12, wpb=495062, bsz=16302.6, num_updates=30200, lr=0.000363937, gnorm=0.188, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=27817 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 epoch 018: 1637 / 1689 loss=3.705, nll_loss=2.173, ppl=4.51, wps=543387, ups=1.1, wpb=492397, bsz=16873, num_updates=30300, lr=0.000363336, gnorm=0.198, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=27907 end of epoch 18 (average epoch stats below) epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 epoch 018 | loss 3.692 | nll_loss 2.157 | ppl 4.46 | wps 537469 | ups 1.09 | wpb 495121 | bsz 16505.4 | num_updates 30352 | lr 0.000363025 | gnorm 0.197 | clip 0 | loss_scale 1 | train_wall 1494 | gb_free 22.9 | wall 27953 Start iterating over samples epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 48 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=548338, ups=1.11, wpb=492033, bsz=16509.7, num_updates=30400, lr=0.000362738, gnorm=0.194, clip=0, loss_scale=1, train_wall=86, gb_free=22.1, wall=27997 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 148 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551660, ups=1.11, wpb=496307, bsz=16898.5, num_updates=30500, lr=0.000362143, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=28087 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 248 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=556020, ups=1.12, wpb=496589, bsz=16536.1, num_updates=30600, lr=0.000361551, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=28176 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 348 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=551543, ups=1.11, wpb=495948, bsz=16655.1, num_updates=30700, lr=0.000360961, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=28266 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 449 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=550809, ups=1.11, wpb=495816, bsz=16078.8, num_updates=30800, lr=0.000360375, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=28356 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 550 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=554029, ups=1.12, wpb=496604, bsz=16621.3, num_updates=30900, lr=0.000359791, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=28446 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 epoch 019: 650 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=557684, ups=1.12, wpb=495808, bsz=16917.8, num_updates=31000, lr=0.000359211, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28535 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019 | valid on 'valid' subset | loss 3.74 | nll_loss 2.172 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.732 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 750 / 1689 loss=3.683, nll_loss=2.148, ppl=4.43, wps=493140, ups=1, wpb=494957, bsz=16630.5, num_updates=31100, lr=0.000358633, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28635 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 850 / 1689 loss=3.684, nll_loss=2.149, ppl=4.44, wps=558731, ups=1.13, wpb=494708, bsz=16408.8, num_updates=31200, lr=0.000358057, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.5, wall=28724 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 950 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=560557, ups=1.13, wpb=496298, bsz=16241.1, num_updates=31300, lr=0.000357485, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.3, wall=28812 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1050 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=558315, ups=1.13, wpb=494068, bsz=16375.8, num_updates=31400, lr=0.000356915, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=28901 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1152 / 1689 loss=3.693, nll_loss=2.159, ppl=4.47, wps=549192, ups=1.11, wpb=493711, bsz=16328.6, num_updates=31500, lr=0.000356348, gnorm=0.202, clip=0, loss_scale=0.25, train_wall=89, gb_free=22.1, wall=28991 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1252 / 1689 loss=3.695, nll_loss=2.161, ppl=4.47, wps=558850, ups=1.13, wpb=494665, bsz=16832.2, num_updates=31600, lr=0.000355784, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21, wall=29079 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1352 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=557421, ups=1.13, wpb=495043, bsz=16730.8, num_updates=31700, lr=0.000355222, gnorm=0.199, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.9, wall=29168 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1452 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=555441, ups=1.12, wpb=495386, bsz=16202.8, num_updates=31800, lr=0.000354663, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.8, wall=29257 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1552 / 1689 loss=3.681, nll_loss=2.146, ppl=4.42, wps=558253, ups=1.13, wpb=495369, bsz=16235, num_updates=31900, lr=0.000354107, gnorm=0.196, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.4, wall=29346 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 epoch 019: 1652 / 1689 loss=3.689, nll_loss=2.156, ppl=4.46, wps=555623, ups=1.12, wpb=495026, bsz=16585.4, num_updates=32000, lr=0.000353553, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29435 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 epoch 019 | valid on 'valid' subset | loss 3.725 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.725 end of epoch 19 (average epoch stats below) epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 epoch 019 | loss 3.684 | nll_loss 2.149 | ppl 4.44 | wps 536035 | ups 1.08 | wpb 495116 | bsz 16503.9 | num_updates 32037 | lr 0.000353349 | gnorm 0.195 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 23.7 | wall 29509 Start iterating over samples epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 63 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=376528, ups=0.77, wpb=490853, bsz=16319.9, num_updates=32100, lr=0.000353002, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29565 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 163 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=557068, ups=1.12, wpb=495858, bsz=16221.4, num_updates=32200, lr=0.000352454, gnorm=0.196, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=29654 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 263 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=559053, ups=1.13, wpb=495010, bsz=16435, num_updates=32300, lr=0.000351908, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=29743 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 363 / 1689 loss=3.665, nll_loss=2.128, ppl=4.37, wps=555001, ups=1.12, wpb=494956, bsz=16683, num_updates=32400, lr=0.000351364, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=23.1, wall=29832 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 463 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=554440, ups=1.11, wpb=497584, bsz=16615.5, num_updates=32500, lr=0.000350823, gnorm=0.197, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=29922 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 563 / 1689 loss=3.678, nll_loss=2.143, ppl=4.42, wps=550333, ups=1.11, wpb=494652, bsz=16816.3, num_updates=32600, lr=0.000350285, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=30012 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 663 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551418, ups=1.11, wpb=495254, bsz=16400.7, num_updates=32700, lr=0.000349749, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.2, wall=30102 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 763 / 1689 loss=3.678, nll_loss=2.142, ppl=4.42, wps=551346, ups=1.11, wpb=496691, bsz=16944, num_updates=32800, lr=0.000349215, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=30192 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 863 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=553064, ups=1.12, wpb=494102, bsz=16427.8, num_updates=32900, lr=0.000348684, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=30281 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 epoch 020: 963 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=553660, ups=1.12, wpb=494416, bsz=16436.9, num_updates=33000, lr=0.000348155, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=30370 begin validation on "valid" subset epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020 | valid on 'valid' subset | loss 3.731 | nll_loss 2.169 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.725 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1063 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=471391, ups=0.95, wpb=495090, bsz=16377.8, num_updates=33100, lr=0.000347629, gnorm=0.2, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=30475 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1163 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=551967, ups=1.12, wpb=494608, bsz=16617.4, num_updates=33200, lr=0.000347105, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=30565 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1263 / 1689 loss=3.681, nll_loss=2.147, ppl=4.43, wps=553610, ups=1.12, wpb=495067, bsz=16579, num_updates=33300, lr=0.000346583, gnorm=0.189, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=30654 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1363 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=558991, ups=1.13, wpb=494951, bsz=16409, num_updates=33400, lr=0.000346064, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=30743 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1464 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=551436, ups=1.11, wpb=494895, bsz=16511, num_updates=33500, lr=0.000345547, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=30833 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1564 / 1689 loss=3.682, nll_loss=2.147, ppl=4.43, wps=558698, ups=1.13, wpb=494862, bsz=16193, num_updates=33600, lr=0.000345033, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=30921 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 epoch 020: 1664 / 1689 loss=3.68, nll_loss=2.146, ppl=4.42, wps=561897, ups=1.13, wpb=496629, bsz=16561.5, num_updates=33700, lr=0.00034452, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=31010 end of epoch 20 (average epoch stats below) epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 epoch 020 | loss 3.678 | nll_loss 2.142 | ppl 4.41 | wps 549062 | ups 1.11 | wpb 495112 | bsz 16504.3 | num_updates 33725 | lr 0.000344393 | gnorm 0.193 | clip 0 | loss_scale 1 | train_wall 1482 | gb_free 23.5 | wall 31031 Start iterating over samples epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 75 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=557610, ups=1.13, wpb=491870, bsz=16241.5, num_updates=33800, lr=0.00034401, gnorm=0.198, clip=0, loss_scale=1, train_wall=86, gb_free=21.8, wall=31098 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 175 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=558684, ups=1.13, wpb=494997, bsz=16453.8, num_updates=33900, lr=0.000343503, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=31186 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 epoch 021: 277 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=543409, ups=1.1, wpb=495722, bsz=16539.2, num_updates=34000, lr=0.000342997, gnorm=0.19, clip=0, loss_scale=0.5, train_wall=90, gb_free=21.6, wall=31278 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.724 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 377 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=454611, ups=0.92, wpb=494130, bsz=16805.8, num_updates=34100, lr=0.000342494, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=31386 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 477 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=555317, ups=1.12, wpb=494740, bsz=16670.6, num_updates=34200, lr=0.000341993, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=31475 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 577 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=556045, ups=1.12, wpb=494497, bsz=16675.1, num_updates=34300, lr=0.000341494, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=31564 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 677 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=553636, ups=1.12, wpb=495525, bsz=16492.6, num_updates=34400, lr=0.000340997, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=31654 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 777 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=555550, ups=1.12, wpb=494496, bsz=16663.3, num_updates=34500, lr=0.000340503, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=31743 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 877 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=558125, ups=1.13, wpb=495680, bsz=16222.6, num_updates=34600, lr=0.00034001, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=31832 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 977 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=552571, ups=1.11, wpb=496336, bsz=16280.8, num_updates=34700, lr=0.00033952, gnorm=0.2, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31922 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1077 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=555028, ups=1.12, wpb=495070, bsz=16449.7, num_updates=34800, lr=0.000339032, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32011 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1177 / 1689 loss=3.675, nll_loss=2.139, ppl=4.4, wps=552274, ups=1.11, wpb=495410, bsz=16737, num_updates=34900, lr=0.000338546, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=32100 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 epoch 021: 1277 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=554892, ups=1.12, wpb=494735, bsz=16685.6, num_updates=35000, lr=0.000338062, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=32190 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021 | valid on 'valid' subset | loss 3.724 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.724 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1377 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=376109, ups=0.76, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.194, clip=0, loss_scale=2, train_wall=105, gb_free=22.7, wall=32321 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1477 / 1689 loss=3.675, nll_loss=2.14, ppl=4.41, wps=557036, ups=1.12, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=32411 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1577 / 1689 loss=3.679, nll_loss=2.144, ppl=4.42, wps=552076, ups=1.11, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=32500 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 epoch 021: 1678 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547810, ups=1.11, wpb=494630, bsz=16527.3, num_updates=35400, lr=0.000336146, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=32591 end of epoch 21 (average epoch stats below) epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 epoch 021 | loss 3.672 | nll_loss 2.135 | ppl 4.39 | wps 532009 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35411 | lr 0.000336094 | gnorm 0.192 | clip 0 | loss_scale 1 | train_wall 1496 | gb_free 24 | wall 32600 Start iterating over samples epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 89 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=538857, ups=1.1, wpb=491112, bsz=16526, num_updates=35500, lr=0.000335673, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32682 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 189 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=550578, ups=1.12, wpb=493570, bsz=16081.5, num_updates=35600, lr=0.000335201, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=32772 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 289 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=558159, ups=1.13, wpb=494230, bsz=16687.9, num_updates=35700, lr=0.000334731, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=32860 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 389 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=559990, ups=1.13, wpb=495664, bsz=16765, num_updates=35800, lr=0.000334263, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=32949 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 489 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=557653, ups=1.13, wpb=495564, bsz=16555, num_updates=35900, lr=0.000333797, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33038 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 epoch 022: 590 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551726, ups=1.11, wpb=495748, bsz=16567, num_updates=36000, lr=0.000333333, gnorm=0.185, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=33127 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022 | valid on 'valid' subset | loss 3.715 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.715 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 690 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=462301, ups=0.93, wpb=495734, bsz=16464.4, num_updates=36100, lr=0.000332871, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=33235 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 790 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=556753, ups=1.13, wpb=494668, bsz=16353.9, num_updates=36200, lr=0.000332411, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=33323 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 890 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=556245, ups=1.12, wpb=494578, bsz=16751, num_updates=36300, lr=0.000331953, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33412 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 990 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=558226, ups=1.12, wpb=496460, bsz=16477.8, num_updates=36400, lr=0.000331497, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33501 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1090 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=558618, ups=1.13, wpb=494926, bsz=16594.7, num_updates=36500, lr=0.000331042, gnorm=0.198, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=33590 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1191 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=553605, ups=1.12, wpb=495896, bsz=16395.8, num_updates=36600, lr=0.00033059, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=33679 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1291 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=552629, ups=1.11, wpb=496909, bsz=16469.3, num_updates=36700, lr=0.000330139, gnorm=0.194, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=33769 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1391 / 1689 loss=3.676, nll_loss=2.141, ppl=4.41, wps=557454, ups=1.12, wpb=496065, bsz=16681.8, num_updates=36800, lr=0.00032969, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=33858 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1491 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=553289, ups=1.12, wpb=495150, bsz=16251.7, num_updates=36900, lr=0.000329243, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=33948 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 epoch 022: 1591 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=552590, ups=1.12, wpb=495434, bsz=16614.4, num_updates=37000, lr=0.000328798, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=34038 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 epoch 022 | valid on 'valid' subset | loss 3.713 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.713 end of epoch 22 (average epoch stats below) epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 epoch 022 | loss 3.666 | nll_loss 2.129 | ppl 4.37 | wps 524001 | ups 1.06 | wpb 495112 | bsz 16506.4 | num_updates 37097 | lr 0.000328368 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1480 | gb_free 22.7 | wall 34193 Start iterating over samples epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 3 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=309166, ups=0.63, wpb=491588, bsz=16390.2, num_updates=37100, lr=0.000328355, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=34197 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 103 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=568209, ups=1.15, wpb=495372, bsz=16652.6, num_updates=37200, lr=0.000327913, gnorm=0.185, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=34284 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 204 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=551127, ups=1.11, wpb=495090, bsz=17038, num_updates=37300, lr=0.000327473, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=34374 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 304 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=563751, ups=1.13, wpb=496831, bsz=16660.2, num_updates=37400, lr=0.000327035, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=34462 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 404 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=563270, ups=1.14, wpb=495988, bsz=16681.8, num_updates=37500, lr=0.000326599, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.5, wall=34550 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 504 / 1689 loss=3.656, nll_loss=2.118, ppl=4.34, wps=558806, ups=1.13, wpb=495221, bsz=16698.6, num_updates=37600, lr=0.000326164, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=34638 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 604 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=557532, ups=1.13, wpb=494594, bsz=16185.8, num_updates=37700, lr=0.000325731, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=34727 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 704 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=550909, ups=1.11, wpb=494594, bsz=16524.2, num_updates=37800, lr=0.0003253, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=34817 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 804 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556381, ups=1.12, wpb=495985, bsz=16535.8, num_updates=37900, lr=0.000324871, gnorm=0.191, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=34906 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 epoch 023: 904 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=554675, ups=1.12, wpb=495087, bsz=16887.6, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=34995 begin validation on "valid" subset epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023 | valid on 'valid' subset | loss 3.713 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.713 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1004 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=443986, ups=0.9, wpb=495296, bsz=16199.4, num_updates=38100, lr=0.000324017, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=35107 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1104 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=559005, ups=1.13, wpb=494254, bsz=16301.7, num_updates=38200, lr=0.000323592, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35195 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1205 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=556198, ups=1.12, wpb=496018, bsz=16171.3, num_updates=38300, lr=0.00032317, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=35284 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1305 / 1689 loss=3.668, nll_loss=2.132, ppl=4.38, wps=559892, ups=1.13, wpb=495165, bsz=16558.5, num_updates=38400, lr=0.000322749, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=35373 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1405 / 1689 loss=3.661, nll_loss=2.124, ppl=4.36, wps=554222, ups=1.12, wpb=494554, bsz=16492.3, num_updates=38500, lr=0.000322329, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=35462 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1505 / 1689 loss=3.658, nll_loss=2.121, ppl=4.35, wps=546409, ups=1.1, wpb=494709, bsz=16550.6, num_updates=38600, lr=0.000321911, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=35553 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 epoch 023: 1605 / 1689 loss=3.663, nll_loss=2.127, ppl=4.37, wps=556157, ups=1.12, wpb=496558, bsz=16291, num_updates=38700, lr=0.000321495, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=35642 end of epoch 23 (average epoch stats below) epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 epoch 023 | loss 3.661 | nll_loss 2.123 | ppl 4.36 | wps 548489 | ups 1.11 | wpb 495125 | bsz 16506.3 | num_updates 38784 | lr 0.000321147 | gnorm 0.19 | clip 0 | loss_scale 1 | train_wall 1477 | gb_free 23.7 | wall 35716 Start iterating over samples epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 16 / 1689 loss=3.664, nll_loss=2.128, ppl=4.37, wps=552336, ups=1.12, wpb=491748, bsz=16232.1, num_updates=38800, lr=0.000321081, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=35731 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 116 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=556630, ups=1.12, wpb=496300, bsz=16697.8, num_updates=38900, lr=0.000320668, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=22.1, wall=35820 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 epoch 024: 217 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=548305, ups=1.11, wpb=496081, bsz=16784.4, num_updates=39000, lr=0.000320256, gnorm=0.2, clip=0, loss_scale=1, train_wall=89, gb_free=22.5, wall=35911 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.735 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.713 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 317 / 1689 loss=3.648, nll_loss=2.109, ppl=4.32, wps=486356, ups=0.98, wpb=495508, bsz=16228.5, num_updates=39100, lr=0.000319847, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=36013 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 417 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=557238, ups=1.13, wpb=494620, bsz=16434, num_updates=39200, lr=0.000319438, gnorm=0.196, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=36101 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 517 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=557472, ups=1.12, wpb=496996, bsz=16575.6, num_updates=39300, lr=0.000319032, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36190 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 617 / 1689 loss=3.656, nll_loss=2.119, ppl=4.34, wps=557473, ups=1.13, wpb=494765, bsz=16406.1, num_updates=39400, lr=0.000318626, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=36279 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 717 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=555038, ups=1.12, wpb=494274, bsz=16267.9, num_updates=39500, lr=0.000318223, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=36368 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 818 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=550904, ups=1.11, wpb=496509, bsz=16343.9, num_updates=39600, lr=0.000317821, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=36458 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 918 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=555449, ups=1.12, wpb=495991, bsz=16640.7, num_updates=39700, lr=0.00031742, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=36548 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1018 / 1689 loss=3.659, nll_loss=2.122, ppl=4.35, wps=556530, ups=1.12, wpb=495134, bsz=16588.6, num_updates=39800, lr=0.000317021, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=36637 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1118 / 1689 loss=3.657, nll_loss=2.12, ppl=4.35, wps=557037, ups=1.12, wpb=496322, bsz=16934.7, num_updates=39900, lr=0.000316624, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=36726 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 epoch 024: 1218 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=555339, ups=1.12, wpb=493866, bsz=16680, num_updates=40000, lr=0.000316228, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=36815 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024 | valid on 'valid' subset | loss 3.72 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.713 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1319 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=368180, ups=0.74, wpb=494995, bsz=16512.7, num_updates=40100, lr=0.000315833, gnorm=0.188, clip=0, loss_scale=1, train_wall=116, gb_free=21.8, wall=36949 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1419 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=548054, ups=1.11, wpb=494282, bsz=16245.3, num_updates=40200, lr=0.00031544, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.7, wall=37039 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1519 / 1689 loss=3.666, nll_loss=2.129, ppl=4.38, wps=550878, ups=1.11, wpb=495190, bsz=16424.3, num_updates=40300, lr=0.000315049, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=37129 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 epoch 024: 1619 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=557066, ups=1.12, wpb=495908, bsz=16533, num_updates=40400, lr=0.000314658, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=37218 end of epoch 24 (average epoch stats below) epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 epoch 024 | loss 3.656 | nll_loss 2.118 | ppl 4.34 | wps 533719 | ups 1.08 | wpb 495122 | bsz 16507.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.188 | clip 0 | loss_scale 1 | train_wall 1507 | gb_free 23 | wall 37280 Start iterating over samples epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 30 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=542566, ups=1.11, wpb=490866, bsz=16193.4, num_updates=40500, lr=0.00031427, gnorm=0.197, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=37309 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 130 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=553229, ups=1.12, wpb=495116, bsz=16375, num_updates=40600, lr=0.000313882, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37398 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 231 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=547102, ups=1.1, wpb=496680, bsz=16554.4, num_updates=40700, lr=0.000313497, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=22.4, wall=37489 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 331 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=555295, ups=1.12, wpb=496417, bsz=16701.5, num_updates=40800, lr=0.000313112, gnorm=0.196, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=37578 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 431 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=556325, ups=1.12, wpb=495486, bsz=16478.2, num_updates=40900, lr=0.000312729, gnorm=0.193, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=37667 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 epoch 025: 531 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=552291, ups=1.12, wpb=495016, bsz=16582.1, num_updates=41000, lr=0.000312348, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=37757 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025 | valid on 'valid' subset | loss 3.722 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.713 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 631 / 1689 loss=3.654, nll_loss=2.116, ppl=4.33, wps=486959, ups=0.98, wpb=494980, bsz=16458.7, num_updates=41100, lr=0.000311967, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.4, wall=37859 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 732 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=544879, ups=1.1, wpb=495368, bsz=16695.4, num_updates=41200, lr=0.000311588, gnorm=0.181, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=37950 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 832 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=552384, ups=1.12, wpb=495409, bsz=16381.9, num_updates=41300, lr=0.000311211, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=38039 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 932 / 1689 loss=3.66, nll_loss=2.124, ppl=4.36, wps=561988, ups=1.14, wpb=494753, bsz=16877, num_updates=41400, lr=0.000310835, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=38127 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1032 / 1689 loss=3.654, nll_loss=2.116, ppl=4.34, wps=558882, ups=1.13, wpb=494061, bsz=16438.3, num_updates=41500, lr=0.00031046, gnorm=0.184, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=38216 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1132 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=556853, ups=1.13, wpb=494760, bsz=16134.6, num_updates=41600, lr=0.000310087, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=38305 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1232 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=558485, ups=1.13, wpb=495084, bsz=16419.2, num_updates=41700, lr=0.000309715, gnorm=0.183, clip=0, loss_scale=2, train_wall=87, gb_free=22.5, wall=38393 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1334 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=546648, ups=1.1, wpb=496922, bsz=16608.6, num_updates=41800, lr=0.000309344, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=38484 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1434 / 1689 loss=3.652, nll_loss=2.114, ppl=4.33, wps=553451, ups=1.12, wpb=495522, bsz=16504, num_updates=41900, lr=0.000308975, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=38574 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 epoch 025: 1534 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=546509, ups=1.11, wpb=493934, bsz=16762.4, num_updates=42000, lr=0.000308607, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=38664 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025 | valid on 'valid' subset | loss 3.712 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.712 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 epoch 025: 1634 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=385505, ups=0.78, wpb=495536, bsz=16376.3, num_updates=42100, lr=0.00030824, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.9, wall=38793 end of epoch 25 (average epoch stats below) epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 epoch 025 | loss 3.651 | nll_loss 2.112 | ppl 4.32 | wps 534763 | ups 1.08 | wpb 495113 | bsz 16507 | num_updates 42155 | lr 0.000308039 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1480 | gb_free 23.7 | wall 38840 Start iterating over samples epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 45 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=558834, ups=1.14, wpb=491617, bsz=16486, num_updates=42200, lr=0.000307875, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=86, gb_free=22.5, wall=38881 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 145 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=556626, ups=1.12, wpb=495129, bsz=16461, num_updates=42300, lr=0.00030751, gnorm=0.175, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=38970 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 245 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=555200, ups=1.12, wpb=496394, bsz=16876.1, num_updates=42400, lr=0.000307148, gnorm=0.195, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39059 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 345 / 1689 loss=3.645, nll_loss=2.105, ppl=4.3, wps=555072, ups=1.12, wpb=494302, bsz=16762.4, num_updates=42500, lr=0.000306786, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39148 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 445 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=564567, ups=1.14, wpb=494501, bsz=16190.6, num_updates=42600, lr=0.000306426, gnorm=0.197, clip=0, loss_scale=1, train_wall=86, gb_free=21.5, wall=39236 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 545 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=554121, ups=1.12, wpb=495792, bsz=16584.5, num_updates=42700, lr=0.000306067, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=39325 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 646 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=551478, ups=1.11, wpb=495441, bsz=16036.8, num_updates=42800, lr=0.000305709, gnorm=0.184, clip=0, loss_scale=1, train_wall=88, gb_free=21, wall=39415 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 746 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=552831, ups=1.11, wpb=496308, bsz=16737.5, num_updates=42900, lr=0.000305352, gnorm=0.196, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=39505 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 epoch 026: 846 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=557506, ups=1.13, wpb=494653, bsz=16481.7, num_updates=43000, lr=0.000304997, gnorm=0.192, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=39593 begin validation on "valid" subset epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026 | valid on 'valid' subset | loss 3.703 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.703 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 946 / 1689 loss=3.65, nll_loss=2.112, ppl=4.32, wps=372010, ups=0.75, wpb=496017, bsz=16531, num_updates=43100, lr=0.000304643, gnorm=0.186, clip=0, loss_scale=1, train_wall=96, gb_free=22.3, wall=39727 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1046 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=566168, ups=1.14, wpb=495794, bsz=16335.4, num_updates=43200, lr=0.00030429, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=39814 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1146 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=563493, ups=1.14, wpb=494949, bsz=16950.2, num_updates=43300, lr=0.000303939, gnorm=0.191, clip=0, loss_scale=2, train_wall=87, gb_free=21.7, wall=39902 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1247 / 1689 loss=3.651, nll_loss=2.113, ppl=4.33, wps=558864, ups=1.13, wpb=495679, bsz=16158.7, num_updates=43400, lr=0.000303588, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=39991 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1347 / 1689 loss=3.645, nll_loss=2.107, ppl=4.31, wps=556250, ups=1.12, wpb=495146, bsz=16495, num_updates=43500, lr=0.000303239, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40080 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1447 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=559589, ups=1.13, wpb=495809, bsz=16650.9, num_updates=43600, lr=0.000302891, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=40169 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1547 / 1689 loss=3.655, nll_loss=2.118, ppl=4.34, wps=557054, ups=1.13, wpb=495090, bsz=16421.3, num_updates=43700, lr=0.000302545, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=40257 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 epoch 026: 1647 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=555863, ups=1.12, wpb=494798, bsz=16364.7, num_updates=43800, lr=0.000302199, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=20.3, wall=40346 end of epoch 26 (average epoch stats below) epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 epoch 026 | loss 3.646 | nll_loss 2.108 | ppl 4.31 | wps 541241 | ups 1.09 | wpb 495103 | bsz 16505.5 | num_updates 43842 | lr 0.000302054 | gnorm 0.189 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 23.1 | wall 40383 Start iterating over samples epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 59 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=543937, ups=1.11, wpb=490739, bsz=16061.3, num_updates=43900, lr=0.000301855, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=40437 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 epoch 027: 159 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=554919, ups=1.12, wpb=495503, bsz=16478.6, num_updates=44000, lr=0.000301511, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=40526 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027 | valid on 'valid' subset | loss 3.705 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.703 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 259 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=488189, ups=0.98, wpb=495892, bsz=16159.6, num_updates=44100, lr=0.000301169, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=40628 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 359 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=552673, ups=1.11, wpb=495967, bsz=16483.5, num_updates=44200, lr=0.000300828, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=40717 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 459 / 1689 loss=3.639, nll_loss=2.1, ppl=4.29, wps=551485, ups=1.11, wpb=496200, bsz=16334.1, num_updates=44300, lr=0.000300489, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=40807 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 560 / 1689 loss=3.639, nll_loss=2.099, ppl=4.28, wps=548805, ups=1.1, wpb=496726, bsz=16318.1, num_updates=44400, lr=0.00030015, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=40898 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 660 / 1689 loss=3.639, nll_loss=2.099, ppl=4.29, wps=551432, ups=1.12, wpb=493414, bsz=16693.3, num_updates=44500, lr=0.000299813, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=40987 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 760 / 1689 loss=3.643, nll_loss=2.104, ppl=4.3, wps=554545, ups=1.12, wpb=494193, bsz=16775, num_updates=44600, lr=0.000299476, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=41076 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 860 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=557991, ups=1.13, wpb=494394, bsz=16721.3, num_updates=44700, lr=0.000299141, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=41165 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 960 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553418, ups=1.12, wpb=495388, bsz=16241.4, num_updates=44800, lr=0.000298807, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=41254 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1060 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=550861, ups=1.11, wpb=494190, bsz=16852, num_updates=44900, lr=0.000298474, gnorm=0.18, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=41344 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 epoch 027: 1161 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=548852, ups=1.11, wpb=495138, bsz=16965.9, num_updates=45000, lr=0.000298142, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=41434 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.139 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.7 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1261 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=457995, ups=0.92, wpb=496004, bsz=16372, num_updates=45100, lr=0.000297812, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=41543 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1361 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=555720, ups=1.12, wpb=495699, bsz=16367.7, num_updates=45200, lr=0.000297482, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=20.3, wall=41632 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1461 / 1689 loss=3.648, nll_loss=2.11, ppl=4.32, wps=553961, ups=1.12, wpb=496005, bsz=16551.2, num_updates=45300, lr=0.000297154, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=41721 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1561 / 1689 loss=3.649, nll_loss=2.111, ppl=4.32, wps=556982, ups=1.12, wpb=495130, bsz=16369.9, num_updates=45400, lr=0.000296826, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=41810 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 epoch 027: 1661 / 1689 loss=3.645, nll_loss=2.106, ppl=4.31, wps=553784, ups=1.12, wpb=495796, bsz=16809.9, num_updates=45500, lr=0.0002965, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41900 end of epoch 27 (average epoch stats below) epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 epoch 027 | loss 3.642 | nll_loss 2.103 | ppl 4.3 | wps 541937 | ups 1.09 | wpb 495129 | bsz 16508.2 | num_updates 45528 | lr 0.000296409 | gnorm 0.185 | clip 0 | loss_scale 2 | train_wall 1482 | gb_free 23.2 | wall 41924 Start iterating over samples epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 73 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=551493, ups=1.12, wpb=490800, bsz=16529.3, num_updates=45600, lr=0.000296174, gnorm=0.193, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=41989 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 173 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=557221, ups=1.12, wpb=497869, bsz=16403.3, num_updates=45700, lr=0.00029585, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=42078 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 273 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=550371, ups=1.11, wpb=494642, bsz=16497.5, num_updates=45800, lr=0.000295527, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=42168 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 374 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=544206, ups=1.1, wpb=494326, bsz=16589, num_updates=45900, lr=0.000295205, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=89, gb_free=23.1, wall=42259 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 epoch 028: 474 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=553949, ups=1.12, wpb=496633, bsz=16726.8, num_updates=46000, lr=0.000294884, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=42349 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.71 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.7 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 574 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=482063, ups=0.98, wpb=494394, bsz=17136.8, num_updates=46100, lr=0.000294564, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.9, wall=42451 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 674 / 1689 loss=3.644, nll_loss=2.105, ppl=4.3, wps=560146, ups=1.13, wpb=495817, bsz=16426.6, num_updates=46200, lr=0.000294245, gnorm=0.189, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.2, wall=42540 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 774 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=562143, ups=1.13, wpb=496473, bsz=16421.9, num_updates=46300, lr=0.000293927, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.9, wall=42628 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 874 / 1689 loss=3.638, nll_loss=2.099, ppl=4.28, wps=561966, ups=1.13, wpb=495926, bsz=16407.6, num_updates=46400, lr=0.00029361, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=42716 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 974 / 1689 loss=3.643, nll_loss=2.105, ppl=4.3, wps=562889, ups=1.14, wpb=495586, bsz=16445, num_updates=46500, lr=0.000293294, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=42804 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1074 / 1689 loss=3.633, nll_loss=2.092, ppl=4.26, wps=560748, ups=1.13, wpb=495507, bsz=16236.6, num_updates=46600, lr=0.000292979, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=42893 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1174 / 1689 loss=3.636, nll_loss=2.097, ppl=4.28, wps=565738, ups=1.14, wpb=497265, bsz=16305.9, num_updates=46700, lr=0.000292666, gnorm=0.195, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=42981 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1274 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=554269, ups=1.12, wpb=494477, bsz=16327.9, num_updates=46800, lr=0.000292353, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=43070 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1374 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=560824, ups=1.13, wpb=496828, bsz=16667.5, num_updates=46900, lr=0.000292041, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43158 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 epoch 028: 1475 / 1689 loss=3.646, nll_loss=2.107, ppl=4.31, wps=552851, ups=1.12, wpb=494406, bsz=16487.5, num_updates=47000, lr=0.00029173, gnorm=0.188, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=43248 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028 | valid on 'valid' subset | loss 3.72 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.7 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1575 / 1689 loss=3.646, nll_loss=2.108, ppl=4.31, wps=485298, ups=0.98, wpb=494216, bsz=16398.3, num_updates=47100, lr=0.00029142, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.7, wall=43350 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 epoch 028: 1675 / 1689 loss=3.65, nll_loss=2.113, ppl=4.32, wps=554572, ups=1.12, wpb=492955, bsz=16847.4, num_updates=47200, lr=0.000291111, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=43439 end of epoch 28 (average epoch stats below) epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 epoch 028 | loss 3.638 | nll_loss 2.099 | ppl 4.28 | wps 546923 | ups 1.1 | wpb 495129 | bsz 16508.7 | num_updates 47214 | lr 0.000291068 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1479 | gb_free 22.6 | wall 43450 Start iterating over samples epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 86 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=554766, ups=1.13, wpb=491052, bsz=16512.5, num_updates=47300, lr=0.000290803, gnorm=0.189, clip=0, loss_scale=1, train_wall=86, gb_free=22.7, wall=43527 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 186 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=564767, ups=1.14, wpb=495170, bsz=16774.2, num_updates=47400, lr=0.000290496, gnorm=0.174, clip=0, loss_scale=1, train_wall=87, gb_free=21.5, wall=43615 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 286 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=566475, ups=1.14, wpb=495228, bsz=16331.4, num_updates=47500, lr=0.000290191, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=43702 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 387 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560380, ups=1.13, wpb=494117, bsz=16084.9, num_updates=47600, lr=0.000289886, gnorm=0.189, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=43790 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 487 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=558873, ups=1.13, wpb=495858, bsz=16671.8, num_updates=47700, lr=0.000289581, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=43879 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 587 / 1689 loss=3.631, nll_loss=2.09, ppl=4.26, wps=553662, ups=1.12, wpb=494773, bsz=16812.2, num_updates=47800, lr=0.000289278, gnorm=0.186, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=43968 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 688 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=549213, ups=1.11, wpb=496786, bsz=16512.6, num_updates=47900, lr=0.000288976, gnorm=0.193, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=44059 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 epoch 029: 788 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=557137, ups=1.12, wpb=496334, bsz=16461, num_updates=48000, lr=0.000288675, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=44148 begin validation on "valid" subset epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029 | valid on 'valid' subset | loss 3.709 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.7 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 888 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=485957, ups=0.99, wpb=493193, bsz=16666.9, num_updates=48100, lr=0.000288375, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=44249 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 988 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=558202, ups=1.13, wpb=493069, bsz=16522.4, num_updates=48200, lr=0.000288076, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=44338 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1089 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=548379, ups=1.11, wpb=496260, bsz=16407.6, num_updates=48300, lr=0.000287777, gnorm=0.184, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44428 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1189 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=559870, ups=1.13, wpb=496346, bsz=16745.4, num_updates=48400, lr=0.00028748, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=44517 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1289 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=558723, ups=1.13, wpb=496383, bsz=16581, num_updates=48500, lr=0.000287183, gnorm=0.179, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.7, wall=44606 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1389 / 1689 loss=3.647, nll_loss=2.108, ppl=4.31, wps=557004, ups=1.12, wpb=496213, bsz=16381.2, num_updates=48600, lr=0.000286888, gnorm=0.18, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.3, wall=44695 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1489 / 1689 loss=3.635, nll_loss=2.095, ppl=4.27, wps=547352, ups=1.11, wpb=494628, bsz=16468.9, num_updates=48700, lr=0.000286593, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=89, gb_free=21.7, wall=44785 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1589 / 1689 loss=3.642, nll_loss=2.103, ppl=4.3, wps=550718, ups=1.11, wpb=494445, bsz=16298.2, num_updates=48800, lr=0.000286299, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=44875 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 epoch 029: 1689 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=557066, ups=1.13, wpb=492600, bsz=16170.1, num_updates=48900, lr=0.000286006, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=87, gb_free=25.8, wall=44963 end of epoch 29 (average epoch stats below) epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 epoch 029 | loss 3.634 | nll_loss 2.094 | ppl 4.27 | wps 551570 | ups 1.11 | wpb 495110 | bsz 16505.4 | num_updates 48900 | lr 0.000286006 | gnorm 0.184 | clip 0 | loss_scale 0.5 | train_wall 1477 | gb_free 25.8 | wall 44963 Start iterating over samples epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 epoch 030: 100 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551647, ups=1.11, wpb=494889, bsz=16543.8, num_updates=49000, lr=0.000285714, gnorm=0.195, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.3, wall=45053 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.698 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.698 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 200 / 1689 loss=3.621, nll_loss=2.079, ppl=4.23, wps=358332, ups=0.72, wpb=495396, bsz=16278.4, num_updates=49100, lr=0.000285423, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=113, gb_free=21.7, wall=45191 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 300 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=562575, ups=1.13, wpb=495965, bsz=16213.8, num_updates=49200, lr=0.000285133, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=45280 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 400 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=559320, ups=1.13, wpb=494729, bsz=16497, num_updates=49300, lr=0.000284844, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=45368 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 500 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558067, ups=1.12, wpb=496541, bsz=16768.9, num_updates=49400, lr=0.000284555, gnorm=0.188, clip=0, loss_scale=1, train_wall=87, gb_free=22.9, wall=45457 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 600 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=554713, ups=1.12, wpb=496114, bsz=16518.6, num_updates=49500, lr=0.000284268, gnorm=0.194, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=45546 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 700 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=557368, ups=1.13, wpb=495294, bsz=16461.4, num_updates=49600, lr=0.000283981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=21.7, wall=45635 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 800 / 1689 loss=3.635, nll_loss=2.096, ppl=4.27, wps=560015, ups=1.13, wpb=496257, bsz=16346.6, num_updates=49700, lr=0.000283695, gnorm=0.183, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=45724 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 900 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554831, ups=1.12, wpb=494923, bsz=16430.3, num_updates=49800, lr=0.00028341, gnorm=0.18, clip=0, loss_scale=2, train_wall=87, gb_free=22, wall=45813 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1001 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=554704, ups=1.12, wpb=494809, bsz=16585.6, num_updates=49900, lr=0.000283126, gnorm=0.179, clip=0, loss_scale=1, train_wall=87, gb_free=22.6, wall=45902 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 epoch 030: 1101 / 1689 loss=3.637, nll_loss=2.098, ppl=4.28, wps=554982, ups=1.12, wpb=494862, bsz=16600.7, num_updates=50000, lr=0.000282843, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=45992 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.698 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1201 / 1689 loss=3.64, nll_loss=2.101, ppl=4.29, wps=482409, ups=0.98, wpb=494441, bsz=16925.5, num_updates=50100, lr=0.00028256, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=46094 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1302 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=549138, ups=1.11, wpb=494105, bsz=16311.3, num_updates=50200, lr=0.000282279, gnorm=0.188, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=46184 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1402 / 1689 loss=3.634, nll_loss=2.095, ppl=4.27, wps=555468, ups=1.12, wpb=495805, bsz=16661.3, num_updates=50300, lr=0.000281998, gnorm=0.192, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=46273 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1502 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=556861, ups=1.12, wpb=495864, bsz=16654.3, num_updates=50400, lr=0.000281718, gnorm=0.179, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=46362 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 epoch 030: 1602 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=552020, ups=1.11, wpb=496194, bsz=16500.3, num_updates=50500, lr=0.000281439, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=46452 end of epoch 30 (average epoch stats below) epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 epoch 030 | loss 3.631 | nll_loss 2.091 | ppl 4.26 | wps 533446 | ups 1.08 | wpb 495124 | bsz 16507.3 | num_updates 50587 | lr 0.000281197 | gnorm 0.186 | clip 0 | loss_scale 0.5 | train_wall 1501 | gb_free 23.8 | wall 46529 Start iterating over samples epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 13 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=544489, ups=1.11, wpb=490940, bsz=16352.8, num_updates=50600, lr=0.000281161, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=46542 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 113 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=555758, ups=1.12, wpb=494863, bsz=16403.4, num_updates=50700, lr=0.000280883, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=46631 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 213 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=559583, ups=1.13, wpb=495008, bsz=16661.3, num_updates=50800, lr=0.000280607, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=46720 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 313 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556616, ups=1.12, wpb=495583, bsz=16250.6, num_updates=50900, lr=0.000280331, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=46809 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 epoch 031: 413 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=550168, ups=1.11, wpb=493470, bsz=16421.3, num_updates=51000, lr=0.000280056, gnorm=0.191, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=46899 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.713 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.698 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 513 / 1689 loss=3.619, nll_loss=2.077, ppl=4.22, wps=491844, ups=0.99, wpb=494777, bsz=16783.8, num_updates=51100, lr=0.000279782, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=46999 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 613 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=556643, ups=1.12, wpb=495303, bsz=16515, num_updates=51200, lr=0.000279508, gnorm=0.192, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=47088 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 714 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=549957, ups=1.11, wpb=496130, bsz=16652.2, num_updates=51300, lr=0.000279236, gnorm=0.188, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=47178 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 814 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=554568, ups=1.12, wpb=496338, bsz=16337.3, num_updates=51400, lr=0.000278964, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47268 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 914 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560052, ups=1.13, wpb=495739, bsz=16429.5, num_updates=51500, lr=0.000278693, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.5, wall=47356 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1014 / 1689 loss=3.632, nll_loss=2.092, ppl=4.26, wps=554258, ups=1.12, wpb=495035, bsz=16707.4, num_updates=51600, lr=0.000278423, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47446 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1114 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=560314, ups=1.13, wpb=495875, bsz=15970.8, num_updates=51700, lr=0.000278154, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=47534 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1215 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=548729, ups=1.11, wpb=493469, bsz=16728.1, num_updates=51800, lr=0.000277885, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=20.2, wall=47624 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1315 / 1689 loss=3.636, nll_loss=2.096, ppl=4.28, wps=556537, ups=1.12, wpb=495600, bsz=16308.6, num_updates=51900, lr=0.000277617, gnorm=0.19, clip=0, loss_scale=1, train_wall=87, gb_free=22.2, wall=47713 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 epoch 031: 1415 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558397, ups=1.13, wpb=495014, bsz=16912.9, num_updates=52000, lr=0.00027735, gnorm=0.178, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=47802 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031 | valid on 'valid' subset | loss 3.71 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.698 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1515 / 1689 loss=3.637, nll_loss=2.097, ppl=4.28, wps=492061, ups=0.99, wpb=496827, bsz=16807.2, num_updates=52100, lr=0.000277084, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=47903 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 epoch 031: 1615 / 1689 loss=3.63, nll_loss=2.091, ppl=4.26, wps=562558, ups=1.14, wpb=495338, bsz=16354.3, num_updates=52200, lr=0.000276818, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=47991 end of epoch 31 (average epoch stats below) epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 epoch 031 | loss 3.627 | nll_loss 2.087 | ppl 4.25 | wps 546940 | ups 1.1 | wpb 495118 | bsz 16506.8 | num_updates 52273 | lr 0.000276625 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1478 | gb_free 22.9 | wall 48056 Start iterating over samples epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 27 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=550968, ups=1.12, wpb=492958, bsz=16579.8, num_updates=52300, lr=0.000276553, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=48080 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 127 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=555459, ups=1.12, wpb=494354, bsz=16750.3, num_updates=52400, lr=0.000276289, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=21, wall=48169 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 227 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=558229, ups=1.13, wpb=494644, bsz=16585.4, num_updates=52500, lr=0.000276026, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48258 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 327 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=559951, ups=1.13, wpb=496175, bsz=16535.8, num_updates=52600, lr=0.000275764, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.1, wall=48347 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 427 / 1689 loss=3.617, nll_loss=2.076, ppl=4.22, wps=556205, ups=1.12, wpb=496674, bsz=16579, num_updates=52700, lr=0.000275502, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.6, wall=48436 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 527 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=548417, ups=1.11, wpb=494831, bsz=16773, num_updates=52800, lr=0.000275241, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.3, wall=48526 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 627 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560758, ups=1.13, wpb=495898, bsz=16484, num_updates=52900, lr=0.000274981, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22.3, wall=48615 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 epoch 032: 727 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=560882, ups=1.14, wpb=493686, bsz=16592.1, num_updates=53000, lr=0.000274721, gnorm=0.191, clip=0, loss_scale=1, train_wall=86, gb_free=22.5, wall=48703 begin validation on "valid" subset epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032 | valid on 'valid' subset | loss 3.709 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.698 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 827 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=379168, ups=0.77, wpb=495253, bsz=16076.6, num_updates=53100, lr=0.000274462, gnorm=0.176, clip=0, loss_scale=1, train_wall=85, gb_free=22.1, wall=48833 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 928 / 1689 loss=3.625, nll_loss=2.084, ppl=4.24, wps=562392, ups=1.13, wpb=495934, bsz=16473, num_updates=53200, lr=0.000274204, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=48921 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1028 / 1689 loss=3.627, nll_loss=2.087, ppl=4.25, wps=560653, ups=1.13, wpb=496195, bsz=16078.9, num_updates=53300, lr=0.000273947, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49010 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1128 / 1689 loss=3.63, nll_loss=2.09, ppl=4.26, wps=558629, ups=1.13, wpb=494078, bsz=16485, num_updates=53400, lr=0.00027369, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=49098 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1228 / 1689 loss=3.629, nll_loss=2.088, ppl=4.25, wps=562861, ups=1.14, wpb=494986, bsz=16190.6, num_updates=53500, lr=0.000273434, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=49186 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1328 / 1689 loss=3.623, nll_loss=2.082, ppl=4.23, wps=560235, ups=1.13, wpb=496410, bsz=16174.2, num_updates=53600, lr=0.000273179, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.6, wall=49275 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1428 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=555411, ups=1.12, wpb=494139, bsz=16799.1, num_updates=53700, lr=0.000272925, gnorm=0.186, clip=0, loss_scale=1, train_wall=87, gb_free=21.6, wall=49364 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1528 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=557140, ups=1.12, wpb=497226, bsz=16667, num_updates=53800, lr=0.000272671, gnorm=0.191, clip=0, loss_scale=1, train_wall=88, gb_free=22.3, wall=49453 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 epoch 032: 1628 / 1689 loss=3.633, nll_loss=2.094, ppl=4.27, wps=548779, ups=1.11, wpb=494523, bsz=16558.1, num_updates=53900, lr=0.000272418, gnorm=0.178, clip=0, loss_scale=1, train_wall=88, gb_free=22.4, wall=49543 end of epoch 32 (average epoch stats below) epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 epoch 032 | loss 3.624 | nll_loss 2.083 | ppl 4.24 | wps 542289 | ups 1.1 | wpb 495108 | bsz 16506.8 | num_updates 53961 | lr 0.000272264 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1474 | gb_free 22.8 | wall 49597 Start iterating over samples epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 epoch 033: 40 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=541936, ups=1.1, wpb=491677, bsz=16418, num_updates=54000, lr=0.000272166, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=49634 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.696 | nll_loss 2.134 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.696 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 140 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=406982, ups=0.82, wpb=495064, bsz=16574.6, num_updates=54100, lr=0.000271914, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=98, gb_free=21.8, wall=49756 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 240 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=556905, ups=1.12, wpb=496839, bsz=16990.7, num_updates=54200, lr=0.000271663, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=49845 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 340 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=556690, ups=1.13, wpb=494061, bsz=16293.9, num_updates=54300, lr=0.000271413, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=49934 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 440 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=551230, ups=1.11, wpb=494692, bsz=16437.5, num_updates=54400, lr=0.000271163, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=50023 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 540 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=554184, ups=1.12, wpb=496080, bsz=16635, num_updates=54500, lr=0.000270914, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=50113 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 641 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=547956, ups=1.11, wpb=495209, bsz=16699.9, num_updates=54600, lr=0.000270666, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=89, gb_free=20.9, wall=50203 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 741 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=559957, ups=1.13, wpb=494146, bsz=16349.8, num_updates=54700, lr=0.000270418, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=86, gb_free=21.8, wall=50292 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 841 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=557094, ups=1.13, wpb=494478, bsz=16653.6, num_updates=54800, lr=0.000270172, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.9, wall=50380 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 941 / 1689 loss=3.615, nll_loss=2.073, ppl=4.21, wps=557770, ups=1.12, wpb=496225, bsz=16033.8, num_updates=54900, lr=0.000269925, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=50469 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 epoch 033: 1041 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=555203, ups=1.12, wpb=495223, bsz=16588.2, num_updates=55000, lr=0.00026968, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.2, wall=50558 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033 | valid on 'valid' subset | loss 3.7 | nll_loss 2.143 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.696 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1141 / 1689 loss=3.629, nll_loss=2.089, ppl=4.25, wps=493094, ups=1, wpb=494394, bsz=16440.5, num_updates=55100, lr=0.000269435, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50659 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1241 / 1689 loss=3.633, nll_loss=2.093, ppl=4.27, wps=557975, ups=1.13, wpb=495740, bsz=17049.2, num_updates=55200, lr=0.000269191, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=50748 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1341 / 1689 loss=3.626, nll_loss=2.086, ppl=4.24, wps=557433, ups=1.12, wpb=496208, bsz=16315.7, num_updates=55300, lr=0.000268947, gnorm=0.183, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=50837 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1441 / 1689 loss=3.632, nll_loss=2.093, ppl=4.26, wps=558788, ups=1.13, wpb=496367, bsz=16800.6, num_updates=55400, lr=0.000268705, gnorm=0.181, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=50925 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1542 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=549896, ups=1.11, wpb=495194, bsz=16671.4, num_updates=55500, lr=0.000268462, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=51015 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 epoch 033: 1642 / 1689 loss=3.626, nll_loss=2.086, ppl=4.25, wps=559281, ups=1.13, wpb=495384, bsz=16061.1, num_updates=55600, lr=0.000268221, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=51104 end of epoch 33 (average epoch stats below) epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 epoch 033 | loss 3.621 | nll_loss 2.08 | ppl 4.23 | wps 539228 | ups 1.09 | wpb 495127 | bsz 16505.1 | num_updates 55647 | lr 0.000268108 | gnorm 0.182 | clip 0 | loss_scale 0.5 | train_wall 1487 | gb_free 24.1 | wall 51145 Start iterating over samples epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 53 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=542124, ups=1.1, wpb=491078, bsz=16341, num_updates=55700, lr=0.00026798, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.4, wall=51195 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 153 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=552475, ups=1.11, wpb=496226, bsz=16772.1, num_updates=55800, lr=0.00026774, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.9, wall=51284 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 253 / 1689 loss=3.607, nll_loss=2.064, ppl=4.18, wps=559393, ups=1.13, wpb=496394, bsz=16040.3, num_updates=55900, lr=0.0002675, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.9, wall=51373 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 epoch 034: 353 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=557210, ups=1.12, wpb=495389, bsz=16385.4, num_updates=56000, lr=0.000267261, gnorm=0.17, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51462 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.702 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.696 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 453 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=486882, ups=0.99, wpb=494189, bsz=16576.8, num_updates=56100, lr=0.000267023, gnorm=0.175, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51564 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 553 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=555175, ups=1.12, wpb=496260, bsz=16457.3, num_updates=56200, lr=0.000266785, gnorm=0.187, clip=0, loss_scale=1, train_wall=87, gb_free=22.1, wall=51653 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 653 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=558296, ups=1.13, wpb=494808, bsz=16728.3, num_updates=56300, lr=0.000266548, gnorm=0.194, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=51742 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 754 / 1689 loss=3.617, nll_loss=2.076, ppl=4.21, wps=553426, ups=1.11, wpb=496389, bsz=16548.3, num_updates=56400, lr=0.000266312, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.9, wall=51831 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 854 / 1689 loss=3.614, nll_loss=2.072, ppl=4.21, wps=556440, ups=1.12, wpb=494626, bsz=16515.2, num_updates=56500, lr=0.000266076, gnorm=0.175, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.8, wall=51920 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 954 / 1689 loss=3.62, nll_loss=2.079, ppl=4.22, wps=557138, ups=1.13, wpb=493620, bsz=16575.1, num_updates=56600, lr=0.000265841, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.7, wall=52009 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1054 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=558828, ups=1.13, wpb=495153, bsz=16154, num_updates=56700, lr=0.000265606, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22, wall=52097 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1154 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=556971, ups=1.13, wpb=494898, bsz=16469.9, num_updates=56800, lr=0.000265372, gnorm=0.181, clip=0, loss_scale=0.5, train_wall=87, gb_free=20.5, wall=52186 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1255 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=555469, ups=1.12, wpb=495691, bsz=16550.2, num_updates=56900, lr=0.000265139, gnorm=0.176, clip=0, loss_scale=0.5, train_wall=88, gb_free=19.3, wall=52276 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 epoch 034: 1355 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=551874, ups=1.11, wpb=496686, bsz=16731.4, num_updates=57000, lr=0.000264906, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=52366 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034 | valid on 'valid' subset | loss 3.705 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.696 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1455 / 1689 loss=3.628, nll_loss=2.088, ppl=4.25, wps=493390, ups=1, wpb=495716, bsz=16482.9, num_updates=57100, lr=0.000264674, gnorm=0.187, clip=0, loss_scale=0.5, train_wall=87, gb_free=22.2, wall=52466 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1555 / 1689 loss=3.621, nll_loss=2.081, ppl=4.23, wps=554700, ups=1.12, wpb=493845, bsz=16514.9, num_updates=57200, lr=0.000264443, gnorm=0.178, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.6, wall=52555 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 epoch 034: 1655 / 1689 loss=3.618, nll_loss=2.077, ppl=4.22, wps=551534, ups=1.11, wpb=495525, bsz=16426.5, num_updates=57300, lr=0.000264212, gnorm=0.177, clip=0, loss_scale=0.5, train_wall=88, gb_free=22, wall=52645 end of epoch 34 (average epoch stats below) epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 epoch 034 | loss 3.618 | nll_loss 2.076 | ppl 4.22 | wps 546060 | ups 1.1 | wpb 495106 | bsz 16498.4 | num_updates 57334 | lr 0.000264134 | gnorm 0.181 | clip 0 | loss_scale 0.5 | train_wall 1475 | gb_free 22.4 | wall 52674 Start iterating over samples epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 66 / 1689 loss=3.609, nll_loss=2.067, ppl=4.19, wps=375089, ups=0.76, wpb=492068, bsz=16320.4, num_updates=57400, lr=0.000263982, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=109, gb_free=22, wall=52776 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 166 / 1689 loss=3.605, nll_loss=2.062, ppl=4.17, wps=561358, ups=1.14, wpb=493920, bsz=16336.5, num_updates=57500, lr=0.000263752, gnorm=0.182, clip=0, loss_scale=1, train_wall=87, gb_free=21.4, wall=52864 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 267 / 1689 loss=3.607, nll_loss=2.065, ppl=4.18, wps=557554, ups=1.12, wpb=496873, bsz=16478.3, num_updates=57600, lr=0.000263523, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.1, wall=52953 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 368 / 1689 loss=3.606, nll_loss=2.063, ppl=4.18, wps=552864, ups=1.11, wpb=496193, bsz=16525.6, num_updates=57700, lr=0.000263295, gnorm=0.186, clip=0, loss_scale=0.25, train_wall=89, gb_free=22, wall=53043 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 468 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=560321, ups=1.13, wpb=494099, bsz=16433.3, num_updates=57800, lr=0.000263067, gnorm=0.181, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.1, wall=53131 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 568 / 1689 loss=3.614, nll_loss=2.072, ppl=4.2, wps=563242, ups=1.14, wpb=495664, bsz=16266.1, num_updates=57900, lr=0.00026284, gnorm=0.185, clip=0, loss_scale=0.25, train_wall=87, gb_free=21.8, wall=53219 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 epoch 035: 668 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=560337, ups=1.13, wpb=494551, bsz=16439.8, num_updates=58000, lr=0.000262613, gnorm=0.192, clip=0, loss_scale=0.25, train_wall=87, gb_free=22.5, wall=53307 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.692 | nll_loss 2.132 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.692 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 768 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=454648, ups=0.92, wpb=494794, bsz=16801, num_updates=58100, lr=0.000262387, gnorm=0.182, clip=0, loss_scale=0.25, train_wall=88, gb_free=21.6, wall=53416 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 868 / 1689 loss=3.615, nll_loss=2.074, ppl=4.21, wps=558184, ups=1.12, wpb=496354, bsz=16567, num_updates=58200, lr=0.000262161, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.5, wall=53505 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 968 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=554756, ups=1.12, wpb=495326, bsz=16649.1, num_updates=58300, lr=0.000261936, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.8, wall=53594 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1068 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=555912, ups=1.12, wpb=496108, bsz=16347.1, num_updates=58400, lr=0.000261712, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.3, wall=53684 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1168 / 1689 loss=3.62, nll_loss=2.079, ppl=4.23, wps=556954, ups=1.12, wpb=495905, bsz=16359.2, num_updates=58500, lr=0.000261488, gnorm=0.182, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.4, wall=53773 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1268 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=553422, ups=1.12, wpb=494458, bsz=16303.8, num_updates=58600, lr=0.000261265, gnorm=0.191, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.5, wall=53862 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1368 / 1689 loss=3.621, nll_loss=2.08, ppl=4.23, wps=551936, ups=1.12, wpb=493583, bsz=16437.4, num_updates=58700, lr=0.000261042, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.5, wall=53952 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1468 / 1689 loss=3.617, nll_loss=2.075, ppl=4.21, wps=552684, ups=1.11, wpb=496517, bsz=16589, num_updates=58800, lr=0.00026082, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=54041 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1568 / 1689 loss=3.622, nll_loss=2.081, ppl=4.23, wps=552342, ups=1.12, wpb=495278, bsz=16855.7, num_updates=58900, lr=0.000260599, gnorm=0.182, clip=0, loss_scale=1, train_wall=88, gb_free=22.6, wall=54131 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 epoch 035: 1668 / 1689 loss=3.624, nll_loss=2.083, ppl=4.24, wps=553576, ups=1.12, wpb=495335, bsz=16785, num_updates=59000, lr=0.000260378, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=54221 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 epoch 035 | valid on 'valid' subset | loss 3.696 | nll_loss 2.138 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.692 end of epoch 35 (average epoch stats below) epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 epoch 035 | loss 3.615 | nll_loss 2.073 | ppl 4.21 | wps 530079 | ups 1.07 | wpb 495117 | bsz 16503.8 | num_updates 59021 | lr 0.000260331 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 1499 | gb_free 23 | wall 54250 Start iterating over samples epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 79 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=480741, ups=0.98, wpb=491467, bsz=16816.6, num_updates=59100, lr=0.000260157, gnorm=0.18, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=54323 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 181 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=541137, ups=1.1, wpb=493475, bsz=16189.3, num_updates=59200, lr=0.000259938, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=89, gb_free=21.8, wall=54414 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 281 / 1689 loss=3.605, nll_loss=2.062, ppl=4.18, wps=558310, ups=1.13, wpb=495061, bsz=16300.6, num_updates=59300, lr=0.000259718, gnorm=0.186, clip=0, loss_scale=0.5, train_wall=87, gb_free=21.6, wall=54503 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 381 / 1689 loss=3.612, nll_loss=2.07, ppl=4.2, wps=556303, ups=1.12, wpb=496512, bsz=16771.5, num_updates=59400, lr=0.0002595, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54592 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 481 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=555889, ups=1.12, wpb=495644, bsz=16600.7, num_updates=59500, lr=0.000259281, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.7, wall=54681 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 581 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=553519, ups=1.12, wpb=495247, bsz=16410.5, num_updates=59600, lr=0.000259064, gnorm=0.18, clip=0, loss_scale=0.5, train_wall=88, gb_free=22.4, wall=54771 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 681 / 1689 loss=3.613, nll_loss=2.071, ppl=4.2, wps=552196, ups=1.12, wpb=494338, bsz=16190.4, num_updates=59700, lr=0.000258847, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=88, gb_free=21.4, wall=54860 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 781 / 1689 loss=3.619, nll_loss=2.078, ppl=4.22, wps=557313, ups=1.13, wpb=493734, bsz=16237.7, num_updates=59800, lr=0.00025863, gnorm=0.176, clip=0, loss_scale=1, train_wall=87, gb_free=21.9, wall=54949 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 881 / 1689 loss=3.611, nll_loss=2.069, ppl=4.2, wps=555111, ups=1.12, wpb=494524, bsz=16441.2, num_updates=59900, lr=0.000258414, gnorm=0.181, clip=0, loss_scale=1, train_wall=87, gb_free=22, wall=55038 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 epoch 036: 981 / 1689 loss=3.611, nll_loss=2.068, ppl=4.19, wps=558550, ups=1.13, wpb=495331, bsz=16449.3, num_updates=60000, lr=0.000258199, gnorm=0.176, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=55126 Stopping training due to num_updates: 60000 >= max_update: 60000 begin validation on "valid" subset epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 epoch 036 | valid on 'valid' subset | loss 3.69 | nll_loss 2.131 | ppl 4.38 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.69 end of epoch 36 (average epoch stats below) epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 epoch 036 | loss 3.608 | nll_loss 2.065 | ppl 4.19 | wps 541384 | ups 1.09 | wpb 494843 | bsz 16447.5 | num_updates 60000 | lr 0.000258199 | gnorm 0.182 | clip 0 | loss_scale 1 | train_wall 858 | gb_free 21.8 | wall 55145 done training in 55131.2 seconds